#
#
# file: edgar-xml.pl
# desc: EDGAR XML generation support
#
# limit scope
package EDGAR::XML;
use strict;
BEGIN {
# XML Document Object Model support
use XML::DOM;
my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw";
my $edgar_root = "$iw_root/skulker/edgar/current";
# where we find our local libraries
my($libdir) = "$edgar_root/lib";
if ( -d $libdir) {
unshift(@INC, $libdir);
}
# EDGAR Parse types, classifications, and support routines
require 'edgar-parse.pl';
# EDGAR generic utility support
require 'edgar-util.pl';
}
# debug mode?
my($debug) = 0;
# SEC EDGAR to XML DTD (for validation)
my $DTD = 'http://xml.resource.org/doc/edgar/edgardoc.dtd';
my $DOCTYPE = qq|PUBLIC "-//Blocks//DTD EDGARDOC//EN" "$DTD"|;
## for test/debug - bjb, 26-apr-1999
#my $DTD = 'edgartest.dtd';
#my $DOCTYPE = qq|SYSTEM "$DTD"|;
#
# generate standard XML header
#
sub header {
my($edgar) = shift;
# header we will build and return
my($hdr);
# XML version
$hdr = "" . "\n";
# will this change?
$hdr .= "" . "\n\n";
$hdr .= "" . "\n";
$hdr .= transformer($edgar);
$hdr;
}
#
# XML transformer info
#
sub transformer {
my($edgar) = shift;
my($xml) = '';
# transform date
my($date) = `/bin/date +"%d-%h-%Y, %T %Z"`;
chomp($date);
my($accno) = $edgar->{'accession-number'};
my($cik) = $edgar->{'cik'};
$cik =~ s/^0+//;
my($url) = "ftp://ftp.sec.gov/edgar/data";
$url .= "/" . $cik;
$url .= "/" . $accno . ".txt";
$xml = <
Invisible Worlds, Inc.
1179 N. McDowell Blvd
Petaluma
California
94954
US
edgar\@invisible.net
http://invisible.net/
$date
$url
EoD
$xml;
}
#
# generate XML submission
#
sub submission {
my($submission) = shift;
my($xml) = '';
$xml;
}
#
# generate standard XML trailer
#
sub trailer {
my($edgar) = shift;
my($trlr) = "";
$trlr .= "\n";
$trlr .= stamps2xml($edgar);
$trlr .= "\n";
$trlr .= "" . "\n";
$trlr;
}
#
# encode reserved characters
#
sub encode {
my($foo) = shift;
# avoid double encoding &
$foo =~ s/\&/\&/g;
# order is important with & substitution -- keep it first
$foo =~ s/\&/\&/g;
$foo =~ s/\</g;
# not strictly required, but sei chokes on /> - bjb, 26-nov-1999
$foo =~ s/>/\>/g;
$foo;
}
#
# decode reserved characters
#
sub decode {
my($foo) = shift;
$foo =~ s/\&/\&/g;
$foo =~ s/\<//g;
$foo;
}
#
# EDGAR XML parsing support
#
#
# parse Invisible Worlds EDGAR XML format document stream
#
sub parse {
my($data) = shift;
# parse input XML file into DOM structure
my $parser = new XML::DOM::Parser;
my $doc;
eval { $doc = $parser->parse($data, ErrorContext => 2); };
if ($@) {
my $accno = $data;
$accno =~ s|^.*(.*?).*$|$1|s;
# parse error?
warn "EDGAR::XML::parse: $accno: $@\n";
return ();
}
# extract pieces of edgardoc from DOM tree
my %edgardoc = ();
%{$edgardoc{'transformer'}} = getTransformer($doc);
%{$edgardoc{'submission'}} = getSubmission($doc);
%{$edgardoc{'parsed-edgar'}} = getParsedEdgar($doc);
@{$edgardoc{'stamps'}} = getStamps($doc);
# clean up DOM internals -- REQUIRED to avoid memory leaks
$doc->dispose()
if ($doc);
%edgardoc;
}
#
# parse Invisible Worlds EDGAR XML format document file
#
sub parsefile {
my($file) = shift;
# parse input XML file into DOM structure
my $parser = new XML::DOM::Parser;
my $doc;
eval { $doc = $parser->parsefile($file, ErrorContext => 2); };
if ($@) {
# parse error?
warn "EDGAR::XML::parsefile: $file: $@\n";
return ();
}
# extract pieces of edgardoc from DOM tree
my %edgardoc = ();
%{$edgardoc{'transformer'}} = getTransformer($doc);
%{$edgardoc{'submission'}} = getSubmission($doc);
%{$edgardoc{'parsed-edgar'}} = getParsedEdgar($doc);
@{$edgardoc{'stamps'}} = getStamps($doc);
# clean up DOM internals -- REQUIRED to avoid memory leaks
$doc->dispose()
if ($doc);
%edgardoc;
}
#
# get transformer element data
#
sub getTransformer {
my($doc) = shift;
my %transformer = ();
my $tf = $doc->getElementsByTagName("transformer")->item(0);
return () unless ($tf);
my $tf_org = $tf->getElementsByTagName("organization")->item(0);
if ($tf_org) {
$transformer{'organization'} = getElementData($tf_org);
}
my $tf_addr = $tf->getElementsByTagName("address")->item(0);
if ($tf_addr) {
my $tf_postal = $tf_addr->getElementsByTagName("postal")->item(0);
if ($tf_postal) {
for my $node ($tf_postal->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
my $val = '';
next if ($name eq '#text');
if ($node->hasChildNodes()) {
$val = $node->getFirstChild()->getData();
} else {
$val = $node->getData();
}
if ($transformer{$name}) {
$transformer{$name} .= "|" . $val;
} else {
$transformer{$name} = $val;
}
}
}
}
my $tf_email = $tf_addr->getElementsByTagName("email")->item(0);
if ($tf_email) {
$transformer{'email'} = getElementData($tf_email);
}
my $tf_uri = $tf_addr->getElementsByTagName("uri")->item(0);
if ($tf_uri) {
$transformer{'uri'} = getElementData($tf_uri);
}
my $tf_date = $tf->getElementsByTagName("date")->item(0);
if ($tf_date) {
$transformer{'date'} = getElementData($tf_date);
}
my $tf_canon = $tf->getElementsByTagName("canonical")->item(0);
if ($tf_canon) {
my $tf_canon_uri = $tf_canon->getElementsByTagName("uri")->item(0);
if ($tf_canon_uri) {
$transformer{'canonical'} = getElementData($tf_canon_uri);
}
}
return %transformer;
}
#
# get stamps element data
#
sub getStamps {
my($doc) = shift;
my @stamps = ();
my $st = $doc->getElementsByTagName("stamps")->item(0);
return () unless ($st);
if ($st->hasChildNodes()) {
for my $node ($st->getChildNodes()) {
my $name = $node->getNodeName();
my $val = '';
next if ($name eq '#text');
if ($node->hasChildNodes()) {
$val = $node->getFirstChild()->getData();
} else {
$val = $node->getData();
}
push(@stamps, {type => $name, data => $val});
}
}
return @stamps;
}
#
# get SUBMISSION element data
#
sub getSubmission {
my($doc) = shift;
# there will be only one
my $submission = $doc->getElementsByTagName("SUBMISSION")->item(0);
return () unless ($submission);
my %submission = ();
for my $node ($submission->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the ELEMENTs of the submission
next unless ($type == ELEMENT_NODE);
# registrant data
if ($name =~ /^(FILED-BY|FILER|REPORTING-OWNER|SERIAL-COMPANY|SUBJECT-COMPANY)$/o) {
push(@{$submission{'registrants'}}, {getRegistrant($node)} );
# document data
} elsif ($name =~ /^DOCUMENT$/o) {
push(@{$submission{'documents'}}, {getDocument($node)} );
# submission header data
} else {
$name =~ tr/A-Z/a-z/;
$submission{$name} = getElementData($node);
}
}
return %submission;
}
#
# get parsed.edgar element data
#
sub getParsedEdgar {
my($doc) = shift;
# there will be only one
my $parsed = $doc->getElementsByTagName("parsed.edgar")->item(0);
return () unless ($parsed);
my %parsedEdgar = ();
for my $node ($parsed->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the ELEMENTs of the submission
next unless ($type == ELEMENT_NODE);
# parsed.document data
if ($name =~ /^parsed\.document$/o) {
push(@{$parsedEdgar{'parsed-documents'}}, {getParsedDocument($node)} );
# submission header data
} else {
$name =~ tr/A-Z/a-z/;
$parsedEdgar{$name} = getElementData($node);
}
}
return %parsedEdgar;
}
#
# get DOCUMENT element data
#
sub getDocument {
my($doc) = shift;
my %document = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the ELEMENTs of the submission
next unless ($type == ELEMENT_NODE);
# TEXT data?
if ($name eq "TEXT") {
$name =~ tr/A-Z/a-z/;
$document{$name} = $node->toString();
# extract/parse document table elements
$document{'tables'} = [ getDocumentTables($node) ];
} else {
$name =~ tr/A-Z/a-z/;
$document{$name} = getElementData($node);
}
}
return %document;
}
#
# get parsed.document element data
#
sub getParsedDocument {
my($doc) = shift;
my %document = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the ELEMENTs of the submission
next unless ($type == ELEMENT_NODE);
# TEXT data?
if ($name eq "parsed.form") {
$name =~ s/\./\-/g;
$document{$name} = $node->toString();
} else {
$name =~ tr/A-Z/a-z/;
$document{$name} = getElementData($node);
}
}
$document{'parsed-tables'} = { getParsedDocumentTables($doc) };
return %document;
}
#
# get DOCUMENT TABLE element data
#
sub getDocumentTables {
my($doc) = shift;
my @tables = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the TABLE ELEMENTs of the DOCUMENT
next unless ($type == ELEMENT_NODE && $name eq 'TABLE');
# save TABLE data
push(@tables, $node->toString());
}
return @tables;
}
#
# get DOCUMENT parsed TABLE element data
#
sub getParsedDocumentTables {
my($doc) = shift;
# order isn't important for these...we use table id as key
my %tables = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the TABLE ELEMENTs of the DOCUMENT
next unless ($type == ELEMENT_NODE && $name eq 'parsed.table');
my($id) = $node->getAttribute("id");
# save TABLE data
$tables{$id} = $node->toString();
}
return %tables;
}
#
# get SUBMISSION registrant data
#
sub getRegistrant {
my($doc) = shift;
my %registrant = ();
# registrant type
my $type = $doc->getNodeName();
$type =~ tr/A-Z/a-z/;
$registrant{'type'} = $type;
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
# we only care about the ELEMENTs of the submission
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
# company data
if ($name =~ /^company-data$/o) {
$registrant{$name} = {getCompanyData($node)};
# filing values
} elsif ($name =~ /^filing-values$/o) {
push(@{$registrant{$name}}, {getFilingValues($node)} );
# business address
} elsif ($name =~ /^business-address$/o) {
$registrant{$name} = {getBusinessAddress($node)};
# mail address
} elsif ($name =~ /^mail-address$/o) {
$registrant{$name} = {getMailAddress($node)};
# former company data
} elsif ($name =~ /^former-company$/o) {
push(@{$registrant{$name}}, {getFormerCompanyData($node)} );
# unknown element?
} else {
;;
}
}
return %registrant;
}
#
# get company data
#
sub getCompanyData {
my($doc) = shift;
my %company = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
$company{$name} = getElementData($node);
}
return %company;
}
#
# get filing values data
#
sub getFilingValues {
my($doc) = shift;
my %filing = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
$filing{$name} = getElementData($node);
}
return %filing;
}
#
# get business address
#
sub getBusinessAddress {
my($doc) = shift;
my %address = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
$address{$name} = getElementData($node);
}
return %address;
}
#
# get mail address
#
sub getMailAddress {
my($doc) = shift;
my %address = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
$address{$name} = getElementData($node);
}
return %address;
}
#
# get former company data
#
sub getFormerCompanyData {
my($doc) = shift;
my %company = ();
for my $node ($doc->getChildNodes()) {
my $name = $node->getNodeName();
my $type = $node->getNodeType();
next unless ($type == ELEMENT_NODE);
$name =~ tr/A-Z/a-z/;
$company{$name} = getElementData($node);
}
return %company;
}
#
# get generic element data for specified element ($doc)
#
sub getElementData {
my($doc) = shift;
my($data);
if ($doc->hasChildNodes()) {
$data = $doc->getFirstChild()->getData();
} else {
$data = "";
}
return $data;
}
#
# generate parsed edgar XML, if appropriate
#
sub parsedEdgar {
my($data) = shift;
my($document);
my %edgardoc = parse($data);
# parsed edgar docs
my($parsedDocs) = '';
foreach $document (@{$edgardoc{'submission'}{'documents'}}) {
# individual parsed document
my($parsedDoc) = '';
# document data
my($type, $sequence, $description);
$type = $document->{'type'};
$sequence = $document->{'sequence'};
$description = $document->{'description'};
print STDERR "parsedEdgar: type: ($type) sequence: ($sequence)\n"
if ($debug > 1);
$logger::DEBUG->increment('-name' => $type);
# generate parsed form, if we can
## disabled due to broken XML generation - bjb, 04-mar-2000
## my($parsedForm) = EDGAR::Forms::parse($type, $document->{'text'});
my($parsedForm) = "";
if ($parsedForm) {
my($id);
$id = "document_" . $sequence . "_type_" . $type;
# parsed.form is of type ID and spaces aren't allowed
$id =~ s/\s+/_/g;
print STDERR "parsedEdgar: id: ($id)\n"
if ($debug > 1);
$parsedDoc .= "\n";
$parsedDoc .= $parsedForm;
$parsedDoc .= "\n";
}
# generate parsed tables
my(@tbls) = @{$document->{'tables'}};
my($i);
for $i (0 .. $#tbls) {
# clean up
$tbls[$i] = EDGAR::XML::decode($tbls[$i]);
push(@{$document->{'parsed-tables'}},
{EDGAR::Table::parse($tbls[$i])});
}
# table sequence
$i = 1;
for (@{$document->{'parsed-tables'}}) {
# generate XML parsed table data
$parsedDoc .= &EDGAR::Table::toXML($_, $sequence, $i, "");
$i++;
}
if ($parsedDoc) {
$parsedDocs .= "\n";
$parsedDocs .= $parsedDoc . "\n";
$parsedDocs .= "" . "\n";
}
}
# do we have parsed edgar data?
if ($parsedDocs) {
# XML IDs cannot start with a number, a "_" or a letter is fine
my($id) = '';
$id = "_" . $edgardoc{'submission'}{'accession-number'};
my($xml) = '';
$xml = "" . "\n";
$xml .= $parsedDocs;
$xml .= "" . "\n";
$parsedDocs = $xml;
}
$parsedDocs;
}
#
# generate stamps XML data
#
sub stamps2xml {
my($edgar) = shift;
my($xml) = '';
# do we have stamps to include?
my($have_stamps) = 0;
# accession # for this document
my($accno) = $edgar->{'accession-number'};
# stamp file -- PGP ascii-armor signature
my($stamp) = "$accno.xml.asc";
# look for our stamp file
if ( ! -r $stamp ) {
# look in standard install directory
my($path) = $EDGAR::lib::edgar_root . "/xml";
$path .= "/" . EDGAR::util::accno2path($accno);
$stamp = "$path/$stamp";
}
$have_stamps = ( -r $stamp );
if ( $have_stamps ) {
$xml .= "" . "\n";
# load PGP signature file
my(@signature) = `/bin/cat $stamp`;
chomp(@signature);
$xml .= "" . "\n";
$xml .= join("\n", @signature) . "\n";
$xml .= "" . "\n";
$xml .= "" . "\n";
}
$xml;
}
#
# debug stuff
#
sub DEBUG {
my($doc) = shift;
my %transformer = getTransformer($doc);
print STDERR "Transformer:\n";
for (sort keys(%transformer)) {
print STDERR "$_ => ($transformer{$_})\n";
}
my %submission = getSubmission($doc);
print STDERR "Submission:\n";
for (sort keys(%submission)) {
if (ref($submission{$_}) eq "ARRAY") {
print STDERR "$_: [";
for (@{$submission{$_}}) {
my %hash = %{$_};
print STDERR " {";
for (sort keys(%hash)) {
if (ref($hash{$_}) eq "ARRAY") {
my@a = @{$hash{$_}};
print STDERR "$_: [";
for (@a) {
if (ref($_) eq "HASH") {
my %h = %{$_};
print STDERR " {";
for (sort keys(%h)) {
print STDERR "($_ => $h{$_})";
}
print STDERR "} ";
}
}
print STDERR "] ";
} elsif (ref($hash{$_}) eq "HASH") {
my %h = %{$hash{$_}};
print STDERR "$_: {";
for (sort keys(%h)) {
print STDERR "($_ => $h{$_})";
}
print STDERR " } ";
} else {
print STDERR "($_ => $hash{$_})";
}
}
print STDERR "}";
}
print STDERR " ]\n";
} else {
print STDERR "$_ => ($submission{$_})\n";
}
}
my @stamps = getStamps($doc);
print STDERR "Stamps:\n";
for (@stamps) {
my %stamp = %{$_};
for (sort keys(%stamp)) {
print STDERR "$_ => ($stamp{$_})\n";
}
}
0;
}
# keep require happy -- DO NOT REMOVE
1;
=head1 NAME
edgar-xml.pl - EDGAR XML generation support
=head1 PACKAGE
EDGAR::XML
=head1 SYNOPSIS
require edgar-xml.pl;
=head1 REQUIRES
Perl, version 5.001 or higher.
=head1 DESCRIPTION
EDGAR XML generation routines.
=over 3
=head1 METHODS
=head2 header
=item *
generates standard XML header
=item *
An XML header is returned.
=item example:
my ($header) = &EDGAR::XML::header($edgar);
=head2 transformer
=item *
generates transformer information for EDGAR doc
=item *
An XML transformer string is returned.
=item example:
my ($transformer) = &EDGAR::XML::transformer($edgar);
=item example XML:
Invisible Worlds, Inc.
660 York Street
San Francisco
California
94110
US
edgar@invisible.net
http://invisible.net/
01-12-1999
ftp://ftp.sec.gov/edgar/data/888/777777.txt
=head2 trailer
=item *
generates standard XML trailer
=item *
An XML trailer is returned.
=item example:
my ($trailer) = &EDGAR::XML::trailer($edgar);
=head2 encode
=item *
encode reserved characters
=item *
A string with encoded reserved characters is returned.
=item example:
my ($encoded) = &EDGAR::XML::encode($text);
=item encoding performed:
=item
& --> &
=item
< --> <
=head2 decode
=item *
decode reserved characters
=item *
A string with decoded reserved characters is returned.
=item example:
my ($decoded) = &EDGAR::XML::decode($text);
=item decoding performed:
=item
& --> &
=item
< --> <
=item
> --> >
=head2 trim
=item *
removes trailing and leading white space from a string
=item example:
my ($trimmed) = &EDGAR::XML::trim($str);
=head2 encode_av
=item *
encode attribute values
=item *
A string with attribute values encoded is returned. Quotes are put around the value.
=item example:
my ($encoded) = &EDGAR::XML::encode_av($text);
=item example text:
Bill & Ted --> "Bill & Ted"
=head2 parse
=item *
parse EDGAR XML format document stream
=item *
A hash array is returned containing the transformer, submission, parsed-edgar, and stamps sections of the edgar document.
=item example:
my (%edgarDoc) = &EDGAR::XML::parse($text);
=head2 parsefile
=item *
parse EDGAR XML format document file
=item *
A hash array is returned containing the transformer, submission, parsed-edgar, and stamps sections of the edgar document.
=item example:
my (%edgarDoc) = &EDGAR::XML::parse($fileName);
=head2 getTransformer
=item *
get transformer element data
=item *
A hash array containing the transformer data is returned.
=item example:
my (%transformer) = &EDGAR::XML::getTransformer(%doc);
=head2 getStamps
=item *
get stamps element data
=item *
A hash array containing the stamps data is returned.
=item example:
my (%stamps) = &EDGAR::XML::getStamps(%doc);
=head2 getSubmission
=item *
get submission element data
=item *
A hash array containing the submission data is returned.
=item example:
my (%submission) = &EDGAR::XML::getSubmission(%doc);
=head2 getParsedEdgar
=item *
get parsed EDGAR element data
=item *
A hash array containing the parsed EDGAR data is returned.
=item example:
my (%parsedEdgar) = &EDGAR::XML::getParsedEdgar(%doc);
=head2 getDocument
=item *
get document element data
=item *
A hash array containing the document data is returned.
=item example:
my (%document) = &EDGAR::XML::getDocument(%doc);
=head2 getParsedDocument
=item *
get parsed document element data
=item *
A hash array containing the parsed document data is returned.
=item example:
my (%parsedDoc) = &EDGAR::XML::getParsedDocument(%doc);
=head2 getDocumentTables
=item *
get document table element data
=item *
An array containing the document tables is returned.
=item example:
my (@tables) = &EDGAR::XML::getDocumentTables(%doc);
=head2 getRegistrant
=item *
get registrant element data
=item *
A hash array containing the registrant data is returned.
=item example:
my (%registrant) = &EDGAR::XML::getRegistrant(%doc);
=head2 getCompanyData
=item *
get company element data
=item *
A hash array containing the company data is returned.
=item example:
my (%company) = &EDGAR::XML::getCompanyData(%doc);
=head2 getFilingValues
=item *
get filing element data
=item *
A hash array containing the filing data is returned.
=item example:
my (%filing) = &EDGAR::XML::getFilingValues(%doc);
=head2 getBusinessAddress
=item *
get business address element data
=item *
A hash array containing the business address data is returned.
=item example:
my (%businessAddr) = &EDGAR::XML::getBusinessAddress(%doc);
=head2 getMailAddress
=item *
get mail address element data
=item *
A hash array containing the mail address data is returned.
=item example:
my (%mailAddr) = &EDGAR::XML::getMailAddress(%doc);
=head2 getFormerCompanyData
=item *
get former company element data
=item *
A hash array containing the former company data is returned.
=item example:
my (%formerCompany) = &EDGAR::XML::getFormerCompanyData(%doc);
=head2 getDocumentParsedTables
=item *
get document parsed table element data
=item *
An array of parsed table elements are returned.
=item example:
my (@tables) = &EDGAR::XML::getDocumentParsedTables(%doc);
=head2 parsedEdgar
=item *
generate parsed EDGAR XML, if appropriate
=item *
A string of parsed EDGAR in XML format is returned.
=item example:
my ($parsedEdgarXML) = &EDGAR::XML::parsedEdgar($text);
=head2 stamps2xml
=item *
generate stamps XML data
=item *
A string of stamps data XML format is returned.
=item example:
my ($stampsXML) = &EDGAR::XML::stamps2xml($text);
=back
=head1 COPYRIGHT
Copyright 1999 Invisible Worlds.
=cut