#!/iw/perl/current/bin/perl
#
# file: edgar2dtd
# desc: convert SEC EDGAR raw submissions to EDGAR DTD XML format
#
# For every input file named on the command line the script:
#   1. reads the raw submission, picking up its accession number and the
#      CIK(s) found in the header,
#   2. writes <xmldir>/<path>/<accno>.xml (header, raw data, parsed data,
#      trailer),
#   3. hard-links extracted documents found in the current directory into
#      <htmldir>/<path>/ (skipped in debug mode),
#   4. prints the name of the XML file on STDOUT for subsequent processing.
#
# NOTE(review): the copy of this file that was reviewed had been run through
# an HTML tag stripper, so every "<...>" literal was missing.  The spots that
# had to be reconstructed are marked NOTE(review) below -- confirm each one
# against the depot revision before shipping.
#
# Bareword filehandles and two-argument open() are kept on purpose: the
# script declares "require 5.004", which predates lexical filehandles and
# three-argument open().
#

# limit scope...keep this first
package EDGAR::dtd;

eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}'
    if 0;

use strict;
use vars qw($VERSION $RELEASE);

#
# current module version
my $Id =<<'EoI';
# $Id: //depot/isms/skulker/edgar/1.2.4/bin/edgar2dtd#5 $
EoI
#
# Assign the package variables declared by "use vars" above; declaring them
# again with "my" would only shadow them and leave the package copies unset.
$RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/);
$VERSION = "1.2.2";

BEGIN {
    my $iw_root    = ($ENV{IW}) ? $ENV{IW} : "/iw";
    my $edgar_root = "$iw_root/skulker/edgar/current";

    # where we find our local libraries
    my($libdir) = "$edgar_root/lib";
    if ( -d $libdir) {
        unshift(@INC, $libdir);
    }

    # general EDGAR utility support routines
    require 'edgar-util.pl';

    # general EDGAR library support routines
    require 'edgar-lib.pl';

    # EDGAR XML support
    require 'edgar-xml.pl';

    # parsed EDGAR support
    require 'edgar-parse.pl';
}

# we want perl 5.00x or later
require 5.004;

# who am i?
my $prog;
($prog = $0) =~ s#.*/##;

# for processing command line options
use Getopt::Std;

# process command line options, if any
my %opt;
getopts('D:b:h:x:vH', \%opt) || &usage($prog);

# Handle -v and -H before touching the file system, so that asking for the
# version or for help never creates directories as a side effect.

# print the version
if ($opt{'v'}) {
    print &version(), "\n";
    exit;
}

# print the usage
&usage($prog) if $opt{'H'};

# debug mode?
my $debug = defined($opt{'D'}) ? $opt{'D'} : 0;

# where to place submissions
my $basedir = defined($opt{'b'}) ? "$opt{'b'}" : $EDGAR::lib::edgar_data;
EDGAR::util::makepath($basedir, 0775);

# where to place normal HTML submissions
my $htmldir = defined($opt{'h'}) ? "$opt{'h'}" : "$basedir/html";
EDGAR::util::makepath($htmldir, 0775);

# where to place normal submissions
my $xmldir = defined($opt{'x'}) ? "$opt{'x'}" : "$basedir/xml";
EDGAR::util::makepath($xmldir, 0775);

#
# print the usage message on STDOUT and exit
#   $prog : program name shown in the message
#
sub usage {
    my $prog = shift;

    print "usage: $prog [-D Debug_level] [-b base_dir] [-x xml_base_dir] " .
          "[-h html_base_dir] [-v] [-H] input_file\n";
    print "   -D : debug mode, 0-9\n";
    print "   -b : base directory\n";
    print "   -h : html base directory\n";
    print "   -x : xml base directory\n";
    print "   -v : version, print out the version number and exit\n";
    print "   -H : help, print out this message and exit\n";
    print "\n";
    exit;
}

# Debugging package, currently in edgar/lib, should be in common/lib
use Logger;

# set up stuff for run log and Debugging; logging is only on above level 8
my $do_logging = 0;
$do_logging = 1 if ($debug > 8);
$logger::DEBUG = Logger->new();
$logger::DEBUG->setupLogger('-base-dir'      => "/tmp",
                            '-base-filename' => $prog,
                            '-do-logging'    => $do_logging);
$logger::DEBUG->logDateTime();

my $file;
FILE:
foreach $file (@ARGV) {
    my(@submission) = ();
    my(%edgar) = ();

    # type is used only for logging, and only when do_logging is on
    my $type = 0;

    print STDERR "Processing $file ...\n" if ($debug);

    open(IN, "$file") || die "$prog: error reading $file: $!\n";

    # submission accession number
    my($accno) = '';

    # company CIK in case it doesn't match accession number.  used by the
    # edgar2html transformer for canonical url.
    my(@ciks) = ();

    # NOTE(review): the declaration of $in_hdr is not visible in the
    # reviewed copy, although "use strict" demands one.  It is assumed that
    # the header starts at the top of the file -- confirm.
    my $in_hdr = 1;

    # slurp in the whole file
    # NOTE(review): "<IN>" was stripped from the reviewed copy; restored.
    while (<IN>) {
        # already in DTD format?
        # NOTE(review): the original pattern was lost; "<edgardoc" is an
        # assumption based on the "edgardoc XML" wording below -- confirm.
        next FILE if (/^<edgardoc/);

        # NOTE(review): a chomp is assumed here; output_XML() re-joins the
        # saved lines with "\n", which only makes sense for chomped lines.
        chomp;

        # NOTE(review): the tag literals in the patterns below were stripped
        # and are restored from the surrounding comments (accession number,
        # CIK, type, DOCUMENT).  "[^<]*" takes the value up to an optional
        # closing tag, so both tag styles yield the same value -- confirm.
        if (/<ACCESSION-NUMBER>/) {
            ($accno = $_) =~ s|^.*<ACCESSION-NUMBER>([^<]*).*$|$1|;

            # make sure we have a valid accession number
            unless ($accno =~ /^\d+-\d\d-\d+$/) {
                print STDERR
                    "Invalid accession # ($accno) ... skipping $file\n";
                next FILE;
            }

            print STDERR "  ** DEBUG1: accno: ($accno) **\n" if ($debug > 1);
        }

        # only care about CIKs in the header data
        # DO NOT process them in DOCUMENTs
        if ($in_hdr && /<CIK>/) {
            my($cik);
            ($cik = $_) =~ s|^.*<CIK>([^<]*).*$|$1|;

            # make sure we have a valid CIK
            if ($cik =~ /^\d+$/) {
                # we ignore test CIKs ... for now at least
                next FILE if ($EDGAR::lib::testCIK{$cik});

                # save the CIK(s)
                push(@ciks, $cik);
            }
        }

        # remember the first type seen, for the run log only
        if ($do_logging && !$type) {
            if (m|^.*<TYPE>([^<]*)|) {
                $type = $1;
            }
        }

        # first DOCUMENT ends our header
        if ($in_hdr && /<DOCUMENT>/) {
            $in_hdr = 0;
        }

        # clean up some possible problems from earlier processing runs
        s/\&amp;/\&/g;

        push(@submission, $_);
    }
    close(IN);

    unless ($accno =~ /^\d+-\d\d-\d+$/) {
        print STDERR "Missing/invalid accession # ($accno) ... skipping $file\n"
            if ($debug > 1);
        next FILE;
    }

    # fake edgar info for XML for now
    $edgar{'accession-number'} = $accno;

    my($need_cik) = 1;

    # do we need to track CIK separately?  prefer the CIK that matches the
    # leading component of the accession number
    for (@ciks) {
        if ($_ eq (split('-', $accno))[0]) {
            $edgar{'cik'} = $_;
            $need_cik = 0;
            last;
        }
    }

    # did at least one CIK match our accession number?  if not, fall back
    # to the first CIK seen (undefined when the header carried none)
    if ($need_cik) {
        $edgar{'cik'} = $ciks[0];
    }

    $logger::DEBUG->logFileInfo('-file-name' => $file,
                                '-accno'     => $accno,
                                '-type'      => $type);

    # output edgardoc XML submission
    # NOTE(review): output_XML() returns -1 for a skipped test submission;
    # that value is still printed below as if it were a file name.
    my $ofile = &output_XML(\%edgar, \@submission);

    # install extracted source documents
    &install_extracts($edgar{'accession-number'}) unless ($debug);

    # unbuffered stdout
    select STDOUT; $| = 1;

    # display output file name for subsequent processing
    print $ofile, "\n";

    $logger::DEBUG->outputLogger();
    $logger::DEBUG->reInit();
}

# c'ya
exit 0;

#
# output XML submission
#   $edgar      : hash ref; 'accession-number' is read here, and the whole
#                 hash is handed to EDGAR::XML::header() and trailer()
#   $submission : array ref of the submission's lines (without newlines)
# returns the name of the file written, or -1 when EDGAR::util::accno2path()
# yields no path (per the original comment: a test submission)
# dies when the output file cannot be opened or closed
#
sub output_XML {
    my($edgar)      = shift;
    my($submission) = shift;

    my($accno) = $edgar->{'accession-number'};
    my($path)  = EDGAR::util::accno2path($accno);

    unless ($path) {
        # test submission has all zero CIK
        print STDERR "  Skipping test submission: $accno\n" if ($debug);
        return -1;
    }

    my($xmlpath) = "$xmldir";
    $xmlpath .= "/" . $path;

    # build the path if it doesn't exist
    EDGAR::util::makepath($xmlpath, 0775);

    # at debug levels above 1 the result goes to the current directory
    my($ofile) = "$xmlpath/$accno.xml";
    $ofile = "$accno.xml.merged" if ($debug > 1);

    print STDERR "  ** DEBUG1: ofile: ($ofile) **\n" if ($debug > 1);

    open(XML, ">$ofile") || die "$prog: error writing $ofile: $!\n";

    my($submissionData) = join("\n", @{$submission}) . "\n";

    # edgar header data
    print XML EDGAR::XML::header($edgar);

    # edgar submission data
    print XML $submissionData, "\n";

    # parsed edgar data
    print XML EDGAR::XML::parsedEdgar($submissionData);

    # edgar trailer data
    print XML EDGAR::XML::trailer($edgar);

    # The caller announces this file name to downstream processing, so the
    # data must be flushed first; buffered write errors also surface here.
    close(XML) || die "$prog: error closing $ofile: $!\n";

    # return output file name
    $ofile;
}

#
# install extracted source documents
#   $accno : accession number; files named "<accno>.*" in the current
#            directory are candidates
# Each candidate with a known extension is hard-linked into
# <htmldir>/<path>/ (replacing an existing copy) and then removed from the
# current directory.  Link failures are reported on STDERR and skipped.
#
sub install_extracts {
    my $accno = shift;

    for (<$accno.*>) {
        # third dot-separated component of the file name
        my($ext) = (split(/\./))[2];

        # make sure we have an extracted file; the alternation is grouped
        # so that both anchors apply to every extension
        next unless (defined($ext) && $ext =~ /^(?:gif|html|jpg|pdf|txt)$/i);

        my($path) = EDGAR::util::accno2path($accno);

        my($htmlpath) = "$htmldir";
        $htmlpath .= "/" . $path;

        # build the path if it doesn't exist
        EDGAR::util::makepath($htmlpath, 0775);

        # remove existing copy
        if (-f "$htmlpath/$_") {
            unlink "$htmlpath/$_";
        }

        # make copy of file
        unless (link($_, "$htmlpath/$_")) {
            print STDERR "  ** Error linking $htmlpath/$_: $!\n";
            next;
        }

        # remove local copy
        unlink $_;
    }
}

#
# return the version string "<VERSION>r<RELEASE>", prefixed with the
# package name when the package is not main
#
sub version {
    my $ver = $VERSION . "r" . $RELEASE;

    if (__PACKAGE__ !~ /^main$/) {
        $ver = __PACKAGE__ . " " . $ver;
    }

    return $ver;
}