#!/iw/perl/current/bin/perl
#
# file: edgar2html
# desc: convert SEC EDGAR (edgardoc DTD format) XML documents to HTML
#
# usage: edgar2html [-D -F -b -h -V -v -H] input_file ...
#
# Each input file is parsed and rendered in a forked child process; the
# parent waits for the child before moving on to the next file.
#

# limit scope...keep this first
package EDGAR::html;

eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}'
    if 0;

use strict;
use vars qw($VERSION $RELEASE);

#
# current module version
my $Id =<<'EoI';
# $Id: //depot/isms/skulker/edgar/1.2.4/bin/edgar2html#1 $
EoI
#
# These are the package variables declared by "use vars" above (the
# original re-declared them with "my", which left the package variables
# themselves undefined).
$RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/);
$VERSION = "1.2.2";

# Extend @INC in a BEGIN block of its own: a "use" statement runs as soon
# as it is compiled, so the search path has to be in place before the
# "use" lines below are reached.
BEGIN {
    my $iw_root    = ($ENV{IW}) ? $ENV{IW} : "/iw";
    my $edgar_root = "$iw_root/skulker/edgar/current";

    # where we find our local libraries
    my $libdir = "$edgar_root/lib";
    unshift(@INC, $libdir) if -d $libdir;
}

# HTML template support
use HTML::Template;

# text to HTML conversion
use HTML::FromText;

BEGIN {
    # generic edgar support data
    require 'edgar-lib.pl';

    # generic edgar utility support
    require 'edgar-util.pl';

    # edgar HTML support
    require 'edgar-html.pl';

    # edgar XML support
    require 'edgar-xml.pl';
}

# we want perl 5.00x or later
require 5.004;

# for processing command line options
use Getopt::Std;

# file handles without bareword globs or two-argument open
use IO::File;

# errno constants (EAGAIN) for the fork retry test
use POSIX ();

# set up stuff for run log and Debugging
use Logger;

# who am i?
my $prog;
($prog = $0) =~ s#.*/##;

# process command line options, if any; bad options are an error
my %opt;
getopts('D:Fb:h:V:vH', \%opt) || usage($prog, 1);

# print the version (handled before anything touches the file system)
if ($opt{'v'}) {
    my $banner = version();
    print $banner, "\n";
    exit;
}

# print the usage
usage($prog) if $opt{'H'};

# debug mode?
my $debug = defined($opt{'D'}) ? $opt{'D'} : 0;

# force HTML generation?
my $force = defined($opt{'F'});

# which template version to use?
my $tmplversion = defined($opt{'V'}) ? $opt{'V'} : 2;

# EDGAR base/root directory
my $basedir = defined($opt{'b'}) ? "$opt{'b'}" : $EDGAR::lib::edgar_data;
EDGAR::util::makepath($basedir, 0775);

# where to place HTML submissions
my $htmldir = defined($opt{'h'}) ? "$opt{'h'}" : "$basedir/html";
EDGAR::util::makepath($htmldir, 0775);

# HTML template file
my $htmltmpl = $EDGAR::HTML::TMPL;
if ($tmplversion == 2) {
    $htmltmpl = $EDGAR::HTML::TMPLV2;
}

# run log is only written at the highest debug levels
my $do_logging = ($debug > 8) ? 1 : 0;

$SIG{'ALRM'} = \&CATCH_SIGALRM;
$SIG{'INT'}  = \&CATCH_SIGINT;

$logger::DEBUG = Logger->new();
$logger::DEBUG->setupLogger('-base-dir'      => "/tmp",
                            '-base-filename' => $prog,
                            '-do-logging'    => $do_logging);

my $pid;

# name of the marker file kept in /tmp while logging is active
my $prog_pid = $prog . "-" . $$ . ".pid";

# and we're off...
# The loop variable is lexical to the loop: named subs cannot see a
# foreach alias of a file-scoped "my" variable, so the file name is passed
# to get_filename() explicitly.
foreach my $file (@ARGV) {
    unless ( -f $file ) {
        print STDERR "$prog: unable to read $file: $!\n" if ($debug > 1);
        next;
    }

    # accession number = file name without directory and .xml suffix
    my $accno = $file;
    $accno =~ s|^.*/||;
    $accno =~ s|\.xml$||;

    print STDERR "$file" if ($debug);

    # get output filename; nothing comes back for a test submission,
    # which must not be converted
    my $ofile = get_filename($accno, $file);
    next unless defined $ofile;

    # skip if it already exists ... for now
    if (-f $ofile && ! $force) {
        print STDERR " ... Skipping!\n" if ($debug);
        next;
    }
    print STDERR "\n" if ($debug);

  FORK: {
        if ($pid = fork) {
            # parent
            # NOTE(review): haltOn(900, $pid) presumably arms a timeout that
            # ends in CATCH_SIGALRM -- confirm against Logger.
            $logger::DEBUG->haltOn(900, $pid);
            wait;
            $logger::DEBUG->haltOff();
        }
        elsif (defined $pid) {
            # child
            if ($do_logging && (! -e "/tmp/$prog_pid")) {
                create_pid_marker();
                $logger::DEBUG->logDateTime();
            }

            # parse EDGAR XML format doc (edgardoc.dtd format)
            my %edgardoc = EDGAR::XML::parsefile($file);
            $logger::DEBUG->logFileInfo(
                '-file-name' => $file,
                '-accno'     => $edgardoc{'submission'}{'accession-number'},
                '-type'      => $edgardoc{'submission'}{'type'});

            # generate HTML format edgardoc
            my $html = EDGAR::HTML::toHTML(\%edgardoc, $htmltmpl, $tmplversion);

            # spit out the HTML
            output_HTML($ofile, $html);

            $logger::DEBUG->outputLogger();
            $logger::DEBUG->reInit();
            exit(0);    # kill the child
        }
        elsif ($! == POSIX::EAGAIN() || $! =~ /No more process/) {
            # recoverable fork error; test errno rather than only the
            # message text, which differs between platforms and locales
            sleep 5;
            redo FORK;
        }
        else {
            # fork error
            die "Can't fork: $!\n";
        }
    }
}

$logger::DEBUG->shutdownLogger();
remove_pid_marker();

# c'ya
exit 0;

#
# print the usage message and exit
#
#   $prog   - program name shown in the message
#   $status - optional exit status, defaults to 0
#
sub usage {
    my $prog   = shift;
    my $status = @_ ? shift : 0;

    print "usage: $prog [-D -F -b -h -V -v -H] input_file\n";
    print " -D : debug mode, 0-9\n";
    print " -F : force HTML generation\n";
    print " -b : base dir EDGAR base/root directory\n";
    print " -h : html base dir where to place HTML submission\n";
    print " -V : template version, version 1 or 2\n";
    print " -v : version, print out the version number and exit\n";
    print " -H : help, print out this message and exit\n";
    print "\n";
    exit $status;
}

#
# create the /tmp marker file (best effort, failures are ignored just as
# they were with the former `touch`)
#
sub create_pid_marker {
    my $fh = IO::File->new("/tmp/$prog_pid", "a");
    $fh->close() if $fh;
    return;
}

#
# remove the /tmp marker file if it exists
#
sub remove_pid_marker {
    unlink("/tmp/$prog_pid") if (-e "/tmp/$prog_pid");
    return;
}

#
# generate HTML output
#
#   $ofile - path of the file to write
#   $html  - HTML text; a trailing newline is appended
#
# dies if the file cannot be opened or completely written; returns 0
#
sub output_HTML {
    my $ofile = shift;
    my $html  = shift;

    print STDERR " ** DEBUG1: ofile: ($ofile) **\n" if ($debug > 1);

    my $out = IO::File->new($ofile, "w")
        || die "$prog: error writing $ofile: $!\n";
    print $out $html, "\n"
        or die "$prog: error writing $ofile: $!\n";

    # buffered write errors only surface at close
    $out->close()
        || die "$prog: error writing $ofile: $!\n";

    return 0;
}

#
# get output filename
#
#   $accno  - accession number of the submission
#   $source - optional name of the input file, used only in the debug
#             message; defaults to the accession number
#
# returns the output file name, or nothing when accno2path() yields no
# path (test submission), in which case the caller must skip the file
#
sub get_filename {
    my $accno  = shift;
    my $source = shift;
    $source = $accno unless defined $source;

    # get std path from accession number
    my $path = EDGAR::util::accno2path($accno);

    # test data uses all zero CIK
    unless ($path) {
        # test submission has all zero CIK
        print STDERR " Skipping test submission: $source\n" if ($debug);
        return;
    }

    my $htmlpath = "$htmldir" . "/" . $path;

    # at debug levels above 1 the output goes to the current directory
    if ($debug > 1) {
        return "debug.$accno.html";
    }

    # build the path if it doesn't exist
    EDGAR::util::makepath($htmlpath, 0775);
    return "$htmlpath/$accno.html";
}

#
# SIGALRM handler: log the event, flush the logger and send signal 1
# to the pid saved in the logger
#
sub CATCH_SIGALRM {
    my $pid = $logger::DEBUG->savedPid();

    print STDERR "\n\nCaught Interrupt SIGALRM, killing pid [$pid]\n";
    $logger::DEBUG->writeMessage(
        '-message' => "Caught Interrupt SIGALRM, killing pid [$pid]");
    $logger::DEBUG->haltOff();
    $logger::DEBUG->outputLogger();
    $logger::DEBUG->reInit();
    kill 1, $pid;
}

#
# SIGINT handler: flush and shut down the logger, remove the marker
# file and die
#
sub CATCH_SIGINT {
    $logger::DEBUG->writeMessage('-message' => "Caught Interrupt SIGINT");
    $logger::DEBUG->outputLogger();
    $logger::DEBUG->shutdownLogger();
    remove_pid_marker();
    die "\n\nCaught Interrupt SIGINT\n\n";
}

#
# build the version string "<VERSION>r<RELEASE>", prefixed with the
# package name when not running in package main
#
sub version {
    my $ver = $VERSION . "r" . $RELEASE;

    if (__PACKAGE__ ne 'main') {
        $ver = __PACKAGE__ . " " . $ver;
    }
    return $ver;
}