# # # file: edgar-xml.pl # desc: EDGAR XML generation support # # limit scope package EDGAR::XML; use strict; BEGIN { # XML Document Object Model support use XML::DOM; my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $edgar_root = "$iw_root/skulker/edgar/current"; # where we find our local libraries my($libdir) = "$edgar_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # EDGAR Parse types, classifications, and support routines require 'edgar-parse.pl'; # EDGAR generic utility support require 'edgar-util.pl'; } # debug mode? my($debug) = 0; # SEC EDGAR to XML DTD (for validation) my $DTD = 'http://xml.resource.org/doc/edgar/edgardoc.dtd'; my $DOCTYPE = qq|PUBLIC "-//Blocks//DTD EDGARDOC//EN" "$DTD"|; ## for test/debug - bjb, 26-apr-1999 #my $DTD = 'edgartest.dtd'; #my $DOCTYPE = qq|SYSTEM "$DTD"|; # # generate standard XML header # sub header { my($edgar) = shift; # header we will build and return my($hdr); # XML version $hdr = "" . "\n"; # will this change? $hdr .= "" . "\n\n"; $hdr .= "" . "\n"; $hdr .= transformer($edgar); $hdr; } # # XML transformer info # sub transformer { my($edgar) = shift; my($xml) = ''; # transform date my($date) = `/bin/date +"%d-%h-%Y, %T %Z"`; chomp($date); my($accno) = $edgar->{'accession-number'}; my($cik) = $edgar->{'cik'}; $cik =~ s/^0+//; my($url) = "ftp://ftp.sec.gov/edgar/data"; $url .= "/" . $cik; $url .= "/" . $accno . ".txt"; $xml = < Invisible Worlds, Inc.
1179 N. McDowell Blvd Petaluma California 94954 US edgar\@invisible.net http://invisible.net/
$date $url EoD $xml; } # # generate XML submission # sub submission { my($submission) = shift; my($xml) = ''; $xml; } # # generate standard XML trailer # sub trailer { my($edgar) = shift; my($trlr) = ""; $trlr .= "\n"; $trlr .= stamps2xml($edgar); $trlr .= "\n"; $trlr .= "
" . "\n"; $trlr; } # # encode reserved characters # sub encode { my($foo) = shift; # avoid double encoding & $foo =~ s/\&/\&/g; # order is important with & substitution -- keep it first $foo =~ s/\&/\&/g; $foo =~ s/ - bjb, 26-nov-1999 $foo =~ s/>/\>/g; $foo; } # # decode reserved characters # sub decode { my($foo) = shift; $foo =~ s/\&/\&/g; $foo =~ s/\<//g; $foo; } # # EDGAR XML parsing support # # # parse Invisible Worlds EDGAR XML format document stream # sub parse { my($data) = shift; # parse input XML file into DOM structure my $parser = new XML::DOM::Parser; my $doc; eval { $doc = $parser->parse($data, ErrorContext => 2); }; if ($@) { my $accno = $data; $accno =~ s|^.*(.*?).*$|$1|s; # parse error? warn "EDGAR::XML::parse: $accno: $@\n"; return (); } # extract pieces of edgardoc from DOM tree my %edgardoc = (); %{$edgardoc{'transformer'}} = getTransformer($doc); %{$edgardoc{'submission'}} = getSubmission($doc); %{$edgardoc{'parsed-edgar'}} = getParsedEdgar($doc); @{$edgardoc{'stamps'}} = getStamps($doc); # clean up DOM internals -- REQUIRED to avoid memory leaks $doc->dispose() if ($doc); %edgardoc; } # # parse Invisible Worlds EDGAR XML format document file # sub parsefile { my($file) = shift; # parse input XML file into DOM structure my $parser = new XML::DOM::Parser; my $doc; eval { $doc = $parser->parsefile($file, ErrorContext => 2); }; if ($@) { # parse error? warn "EDGAR::XML::parsefile: $file: $@\n"; return (); } # extract pieces of edgardoc from DOM tree my %edgardoc = (); %{$edgardoc{'transformer'}} = getTransformer($doc); %{$edgardoc{'submission'}} = getSubmission($doc); %{$edgardoc{'parsed-edgar'}} = getParsedEdgar($doc); @{$edgardoc{'stamps'}} = getStamps($doc); # clean up DOM internals -- REQUIRED to avoid memory leaks $doc->dispose() if ($doc); %edgardoc; } # # get transformer element data # sub getTransformer { my($doc) = shift; my %transformer = (); my $tf = $doc->getElementsByTagName("transformer")->item(0); return () unless ($tf); my $tf_org = $tf->getElementsByTagName("organization")->item(0); if ($tf_org) { $transformer{'organization'} = getElementData($tf_org); } my $tf_addr = $tf->getElementsByTagName("address")->item(0); if ($tf_addr) { my $tf_postal = $tf_addr->getElementsByTagName("postal")->item(0); if ($tf_postal) { for my $node ($tf_postal->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); my $val = ''; next if ($name eq '#text'); if ($node->hasChildNodes()) { $val = $node->getFirstChild()->getData(); } else { $val = $node->getData(); } if ($transformer{$name}) { $transformer{$name} .= "|" . $val; } else { $transformer{$name} = $val; } } } } my $tf_email = $tf_addr->getElementsByTagName("email")->item(0); if ($tf_email) { $transformer{'email'} = getElementData($tf_email); } my $tf_uri = $tf_addr->getElementsByTagName("uri")->item(0); if ($tf_uri) { $transformer{'uri'} = getElementData($tf_uri); } my $tf_date = $tf->getElementsByTagName("date")->item(0); if ($tf_date) { $transformer{'date'} = getElementData($tf_date); } my $tf_canon = $tf->getElementsByTagName("canonical")->item(0); if ($tf_canon) { my $tf_canon_uri = $tf_canon->getElementsByTagName("uri")->item(0); if ($tf_canon_uri) { $transformer{'canonical'} = getElementData($tf_canon_uri); } } return %transformer; } # # get stamps element data # sub getStamps { my($doc) = shift; my @stamps = (); my $st = $doc->getElementsByTagName("stamps")->item(0); return () unless ($st); if ($st->hasChildNodes()) { for my $node ($st->getChildNodes()) { my $name = $node->getNodeName(); my $val = ''; next if ($name eq '#text'); if ($node->hasChildNodes()) { $val = $node->getFirstChild()->getData(); } else { $val = $node->getData(); } push(@stamps, {type => $name, data => $val}); } } return @stamps; } # # get SUBMISSION element data # sub getSubmission { my($doc) = shift; # there will be only one my $submission = $doc->getElementsByTagName("SUBMISSION")->item(0); return () unless ($submission); my %submission = (); for my $node ($submission->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the ELEMENTs of the submission next unless ($type == ELEMENT_NODE); # registrant data if ($name =~ /^(FILED-BY|FILER|REPORTING-OWNER|SERIAL-COMPANY|SUBJECT-COMPANY)$/o) { push(@{$submission{'registrants'}}, {getRegistrant($node)} ); # document data } elsif ($name =~ /^DOCUMENT$/o) { push(@{$submission{'documents'}}, {getDocument($node)} ); # submission header data } else { $name =~ tr/A-Z/a-z/; $submission{$name} = getElementData($node); } } return %submission; } # # get parsed.edgar element data # sub getParsedEdgar { my($doc) = shift; # there will be only one my $parsed = $doc->getElementsByTagName("parsed.edgar")->item(0); return () unless ($parsed); my %parsedEdgar = (); for my $node ($parsed->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the ELEMENTs of the submission next unless ($type == ELEMENT_NODE); # parsed.document data if ($name =~ /^parsed\.document$/o) { push(@{$parsedEdgar{'parsed-documents'}}, {getParsedDocument($node)} ); # submission header data } else { $name =~ tr/A-Z/a-z/; $parsedEdgar{$name} = getElementData($node); } } return %parsedEdgar; } # # get DOCUMENT element data # sub getDocument { my($doc) = shift; my %document = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the ELEMENTs of the submission next unless ($type == ELEMENT_NODE); # TEXT data? if ($name eq "TEXT") { $name =~ tr/A-Z/a-z/; $document{$name} = $node->toString(); # extract/parse document table elements $document{'tables'} = [ getDocumentTables($node) ]; } else { $name =~ tr/A-Z/a-z/; $document{$name} = getElementData($node); } } return %document; } # # get parsed.document element data # sub getParsedDocument { my($doc) = shift; my %document = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the ELEMENTs of the submission next unless ($type == ELEMENT_NODE); # TEXT data? if ($name eq "parsed.form") { $name =~ s/\./\-/g; $document{$name} = $node->toString(); } else { $name =~ tr/A-Z/a-z/; $document{$name} = getElementData($node); } } $document{'parsed-tables'} = { getParsedDocumentTables($doc) }; return %document; } # # get DOCUMENT TABLE element data # sub getDocumentTables { my($doc) = shift; my @tables = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the TABLE ELEMENTs of the DOCUMENT next unless ($type == ELEMENT_NODE && $name eq 'TABLE'); # save TABLE data push(@tables, $node->toString()); } return @tables; } # # get DOCUMENT parsed TABLE element data # sub getParsedDocumentTables { my($doc) = shift; # order isn't important for these...we use table id as key my %tables = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the TABLE ELEMENTs of the DOCUMENT next unless ($type == ELEMENT_NODE && $name eq 'parsed.table'); my($id) = $node->getAttribute("id"); # save TABLE data $tables{$id} = $node->toString(); } return %tables; } # # get SUBMISSION registrant data # sub getRegistrant { my($doc) = shift; my %registrant = (); # registrant type my $type = $doc->getNodeName(); $type =~ tr/A-Z/a-z/; $registrant{'type'} = $type; for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); # we only care about the ELEMENTs of the submission next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; # company data if ($name =~ /^company-data$/o) { $registrant{$name} = {getCompanyData($node)}; # filing values } elsif ($name =~ /^filing-values$/o) { push(@{$registrant{$name}}, {getFilingValues($node)} ); # business address } elsif ($name =~ /^business-address$/o) { $registrant{$name} = {getBusinessAddress($node)}; # mail address } elsif ($name =~ /^mail-address$/o) { $registrant{$name} = {getMailAddress($node)}; # former company data } elsif ($name =~ /^former-company$/o) { push(@{$registrant{$name}}, {getFormerCompanyData($node)} ); # unknown element? } else { ;; } } return %registrant; } # # get company data # sub getCompanyData { my($doc) = shift; my %company = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; $company{$name} = getElementData($node); } return %company; } # # get filing values data # sub getFilingValues { my($doc) = shift; my %filing = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; $filing{$name} = getElementData($node); } return %filing; } # # get business address # sub getBusinessAddress { my($doc) = shift; my %address = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; $address{$name} = getElementData($node); } return %address; } # # get mail address # sub getMailAddress { my($doc) = shift; my %address = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; $address{$name} = getElementData($node); } return %address; } # # get former company data # sub getFormerCompanyData { my($doc) = shift; my %company = (); for my $node ($doc->getChildNodes()) { my $name = $node->getNodeName(); my $type = $node->getNodeType(); next unless ($type == ELEMENT_NODE); $name =~ tr/A-Z/a-z/; $company{$name} = getElementData($node); } return %company; } # # get generic element data for specified element ($doc) # sub getElementData { my($doc) = shift; my($data); if ($doc->hasChildNodes()) { $data = $doc->getFirstChild()->getData(); } else { $data = ""; } return $data; } # # generate parsed edgar XML, if appropriate # sub parsedEdgar { my($data) = shift; my($document); my %edgardoc = parse($data); # parsed edgar docs my($parsedDocs) = ''; foreach $document (@{$edgardoc{'submission'}{'documents'}}) { # individual parsed document my($parsedDoc) = ''; # document data my($type, $sequence, $description); $type = $document->{'type'}; $sequence = $document->{'sequence'}; $description = $document->{'description'}; print STDERR "parsedEdgar: type: ($type) sequence: ($sequence)\n" if ($debug > 1); $logger::DEBUG->increment('-name' => $type); # generate parsed form, if we can ## disabled due to broken XML generation - bjb, 04-mar-2000 ## my($parsedForm) = EDGAR::Forms::parse($type, $document->{'text'}); my($parsedForm) = ""; if ($parsedForm) { my($id); $id = "document_" . $sequence . "_type_" . $type; # parsed.form is of type ID and spaces aren't allowed $id =~ s/\s+/_/g; print STDERR "parsedEdgar: id: ($id)\n" if ($debug > 1); $parsedDoc .= "\n"; $parsedDoc .= $parsedForm; $parsedDoc .= "\n"; } # generate parsed tables my(@tbls) = @{$document->{'tables'}}; my($i); for $i (0 .. $#tbls) { # clean up $tbls[$i] = EDGAR::XML::decode($tbls[$i]); push(@{$document->{'parsed-tables'}}, {EDGAR::Table::parse($tbls[$i])}); } # table sequence $i = 1; for (@{$document->{'parsed-tables'}}) { # generate XML parsed table data $parsedDoc .= &EDGAR::Table::toXML($_, $sequence, $i, ""); $i++; } if ($parsedDoc) { $parsedDocs .= "\n"; $parsedDocs .= $parsedDoc . "\n"; $parsedDocs .= "" . "\n"; } } # do we have parsed edgar data? if ($parsedDocs) { # XML IDs cannot start with a number, a "_" or a letter is fine my($id) = ''; $id = "_" . $edgardoc{'submission'}{'accession-number'}; my($xml) = ''; $xml = "" . "\n"; $xml .= $parsedDocs; $xml .= "" . "\n"; $parsedDocs = $xml; } $parsedDocs; } # # generate stamps XML data # sub stamps2xml { my($edgar) = shift; my($xml) = ''; # do we have stamps to include? my($have_stamps) = 0; # accession # for this document my($accno) = $edgar->{'accession-number'}; # stamp file -- PGP ascii-armor signature my($stamp) = "$accno.xml.asc"; # look for our stamp file if ( ! -r $stamp ) { # look in standard install directory my($path) = $EDGAR::lib::edgar_root . "/xml"; $path .= "/" . EDGAR::util::accno2path($accno); $stamp = "$path/$stamp"; } $have_stamps = ( -r $stamp ); if ( $have_stamps ) { $xml .= "" . "\n"; # load PGP signature file my(@signature) = `/bin/cat $stamp`; chomp(@signature); $xml .= "" . "\n"; $xml .= join("\n", @signature) . "\n"; $xml .= "" . "\n"; $xml .= "" . "\n"; } $xml; } # # debug stuff # sub DEBUG { my($doc) = shift; my %transformer = getTransformer($doc); print STDERR "Transformer:\n"; for (sort keys(%transformer)) { print STDERR "$_ => ($transformer{$_})\n"; } my %submission = getSubmission($doc); print STDERR "Submission:\n"; for (sort keys(%submission)) { if (ref($submission{$_}) eq "ARRAY") { print STDERR "$_: ["; for (@{$submission{$_}}) { my %hash = %{$_}; print STDERR " {"; for (sort keys(%hash)) { if (ref($hash{$_}) eq "ARRAY") { my@a = @{$hash{$_}}; print STDERR "$_: ["; for (@a) { if (ref($_) eq "HASH") { my %h = %{$_}; print STDERR " {"; for (sort keys(%h)) { print STDERR "($_ => $h{$_})"; } print STDERR "} "; } } print STDERR "] "; } elsif (ref($hash{$_}) eq "HASH") { my %h = %{$hash{$_}}; print STDERR "$_: {"; for (sort keys(%h)) { print STDERR "($_ => $h{$_})"; } print STDERR " } "; } else { print STDERR "($_ => $hash{$_})"; } } print STDERR "}"; } print STDERR " ]\n"; } else { print STDERR "$_ => ($submission{$_})\n"; } } my @stamps = getStamps($doc); print STDERR "Stamps:\n"; for (@stamps) { my %stamp = %{$_}; for (sort keys(%stamp)) { print STDERR "$_ => ($stamp{$_})\n"; } } 0; } # keep require happy -- DO NOT REMOVE 1; =head1 NAME edgar-xml.pl - EDGAR XML generation support =head1 PACKAGE EDGAR::XML =head1 SYNOPSIS require edgar-xml.pl; =head1 REQUIRES Perl, version 5.001 or higher. =head1 DESCRIPTION EDGAR XML generation routines. =over 3 =head1 METHODS =head2 header =item * generates standard XML header =item * An XML header is returned. =item example: my ($header) = &EDGAR::XML::header($edgar); =head2 transformer =item * generates transformer information for EDGAR doc =item * An XML transformer string is returned. =item example: my ($transformer) = &EDGAR::XML::transformer($edgar); =item example XML: Invisible Worlds, Inc.
660 York Street San Francisco California 94110 US edgar@invisible.net http://invisible.net/
01-12-1999 ftp://ftp.sec.gov/edgar/data/888/777777.txt
=head2 trailer =item * generates standard XML trailer =item * An XML trailer is returned. =item example: my ($trailer) = &EDGAR::XML::trailer($edgar); =head2 encode =item * encode reserved characters =item * A string with encoded reserved characters is returned. =item example: my ($encoded) = &EDGAR::XML::encode($text); =item encoding performed: =item & --> & =item < --> < =head2 decode =item * decode reserved characters =item * A string with decoded reserved characters is returned. =item example: my ($decoded) = &EDGAR::XML::decode($text); =item decoding performed: =item & --> & =item < --> < =item > --> > =head2 trim =item * removes trailing and leading white space from a string =item example: my ($trimmed) = &EDGAR::XML::trim($str); =head2 encode_av =item * encode attribute values =item * A string with attribute values encoded is returned. Quotes are put around the value. =item example: my ($encoded) = &EDGAR::XML::encode_av($text); =item example text: Bill & Ted --> "Bill & Ted" =head2 parse =item * parse EDGAR XML format document stream =item * A hash array is returned containing the transformer, submission, parsed-edgar, and stamps sections of the edgar document. =item example: my (%edgarDoc) = &EDGAR::XML::parse($text); =head2 parsefile =item * parse EDGAR XML format document file =item * A hash array is returned containing the transformer, submission, parsed-edgar, and stamps sections of the edgar document. =item example: my (%edgarDoc) = &EDGAR::XML::parse($fileName); =head2 getTransformer =item * get transformer element data =item * A hash array containing the transformer data is returned. =item example: my (%transformer) = &EDGAR::XML::getTransformer(%doc); =head2 getStamps =item * get stamps element data =item * A hash array containing the stamps data is returned. =item example: my (%stamps) = &EDGAR::XML::getStamps(%doc); =head2 getSubmission =item * get submission element data =item * A hash array containing the submission data is returned. =item example: my (%submission) = &EDGAR::XML::getSubmission(%doc); =head2 getParsedEdgar =item * get parsed EDGAR element data =item * A hash array containing the parsed EDGAR data is returned. =item example: my (%parsedEdgar) = &EDGAR::XML::getParsedEdgar(%doc); =head2 getDocument =item * get document element data =item * A hash array containing the document data is returned. =item example: my (%document) = &EDGAR::XML::getDocument(%doc); =head2 getParsedDocument =item * get parsed document element data =item * A hash array containing the parsed document data is returned. =item example: my (%parsedDoc) = &EDGAR::XML::getParsedDocument(%doc); =head2 getDocumentTables =item * get document table element data =item * An array containing the document tables is returned. =item example: my (@tables) = &EDGAR::XML::getDocumentTables(%doc); =head2 getRegistrant =item * get registrant element data =item * A hash array containing the registrant data is returned. =item example: my (%registrant) = &EDGAR::XML::getRegistrant(%doc); =head2 getCompanyData =item * get company element data =item * A hash array containing the company data is returned. =item example: my (%company) = &EDGAR::XML::getCompanyData(%doc); =head2 getFilingValues =item * get filing element data =item * A hash array containing the filing data is returned. =item example: my (%filing) = &EDGAR::XML::getFilingValues(%doc); =head2 getBusinessAddress =item * get business address element data =item * A hash array containing the business address data is returned. =item example: my (%businessAddr) = &EDGAR::XML::getBusinessAddress(%doc); =head2 getMailAddress =item * get mail address element data =item * A hash array containing the mail address data is returned. =item example: my (%mailAddr) = &EDGAR::XML::getMailAddress(%doc); =head2 getFormerCompanyData =item * get former company element data =item * A hash array containing the former company data is returned. =item example: my (%formerCompany) = &EDGAR::XML::getFormerCompanyData(%doc); =head2 getDocumentParsedTables =item * get document parsed table element data =item * An array of parsed table elements are returned. =item example: my (@tables) = &EDGAR::XML::getDocumentParsedTables(%doc); =head2 parsedEdgar =item * generate parsed EDGAR XML, if appropriate =item * A string of parsed EDGAR in XML format is returned. =item example: my ($parsedEdgarXML) = &EDGAR::XML::parsedEdgar($text); =head2 stamps2xml =item * generate stamps XML data =item * A string of stamps data XML format is returned. =item example: my ($stampsXML) = &EDGAR::XML::stamps2xml($text); =back =head1 COPYRIGHT Copyright 1999 Invisible Worlds. =cut