#!/iw/perl/current/bin/perl # # file: fix-nc # desc: clean up EDGAR .nc files for XML conversion # eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}' if 0; # limit scope...keep this first package EDGAR::fixnc; use strict; use vars qw($VERSION $RELEASE); use Symbol; use HTML::Parser 3; # # current module version my $Id =<<'EoI'; # $Id: fix-nc,v 1.4 2000/09/01 20:00:59 bburdick Exp $ EoI # my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/); my $VERSION = "1.2.4"; BEGIN { my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $edgar_root = "$iw_root/skulker/edgar/current"; # where we find our local libraries my($libdir) = "$edgar_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # general EDGAR data types require 'edgar-lib.pl'; # general EDGAR utility support routines require 'edgar-util.pl'; } # we want perl 5.00x or later require 5.004; # who am i? my $prog; ($prog = $0) =~ s#.*/##; # UUdecode needed for PDF documents use Convert::UU qw(uudecode); # for processing command line options use Getopt::Std; # process command line options, if any my %opt; getopts('D:b:kvh', \%opt) || &usage($prog); # where to place submissions my $basedir = defined($opt{'b'}) ? "$opt{'b'}" : $EDGAR::lib::edgar_root; EDGAR::util::makepath($basedir, 0775); # debug mode? my $debug = defined($opt{'D'}) ? $opt{'D'} : 0; # keep orginal external files my $KEEP = defined($opt{'k'}) ? $opt{'k'} : 0; # print the version if ($opt{'v'}) { print &version(), "\n"; exit; } # print the usage &usage($prog) if $opt{'h'}; # HTML host name my $HTMLhost = "html.edgar.space.invisible.net"; # HTML base URI my $HTMLuri = "http://$HTMLhost/public/edgar/html"; # default DOCTYPE for EDGAR nc documents my $DOCTYPE = qq||; sub usage { my $prog = shift; print "usage: $prog [-D debug_level] [-b base_dir] [-v] [-h] input_file\n"; print " -D : debug mode, 0-9\n"; print " -b : base dir, where etc/edgarsec.dtd is found\n"; print " -k : keep, keep the original unchanged external html files, *.bak\n"; print " -v : version, print out the version number and exit\n"; print " -h : help, print out this message and exit\n"; print "\n"; print " low level processing on input_file.\n"; print " Changes are written to the output file, input_file.nc,\n"; print " in the current dir\n"; exit; } # Debugging package, currently in edgar/lib, should be in common/lib use Logger; # set up stuff for run log and Debugging my $do_logging = 0; $do_logging = 1 if ($debug > 8); $logger::DEBUG = Logger->new(); $logger::DEBUG->setupLogger('-base-dir' => "/tmp", '-base-filename' => $prog, '-do-logging' => $do_logging); $logger::DEBUG->logDateTime(); my $file; FILE: foreach $file (@ARGV) { my @submission = (); print STDERR "Processing $file ...\n" if ($debug); if (! open(IN, "$file")) { warn "$prog: error reading $file: $!\n"; next FILE; } $logger::DEBUG->logFileInfo('-file-name' => $file); # slurp in the whole file while () { chomp; # already *fixed*? # next FILE if (/^ start tag # for live edgar mainly if (//) { s/^.*//; } push(@submission, $_) ; } close(IN); # output file to use my $ofile = $file; # live feed files? # submission? $ofile =~ s/\.dissem$/\.nc/; # correction? $ofile =~ s/\.pc$/\.corr01/; $ofile .= ".fixed" if ($debug); # extract any HTML or PDF documents as they are stored separately &extractDocuments(\@submission); # clean up on input data prior to XML processing/conversion # tags for pattern matches my $edgar_tags = join("\|", keys(%EDGAR::lib::edgar_tags)); # processing a table element? my $in_tbl = 0; my $has_fds = 0; my($line, $mtag); # clean up any XML reserved characters in element tags foreach $line (@submission) { # catch any form-feeds $line =~ s/\f//g; # and carriage returns $line =~ s/\r//g; # and trailing spaces $line =~ s/ +$//; # strip any & in element tags...can't encode them while ($line =~ s/^<([^>]*?)&/<$1/g) { ; } # encode XML entities (only & for now) $line =~ s/\&/\&/g; # clean up non-element markup while ($line =~ m@<(/?)([^>]*)@g) { my $end = $1; my $tag = $2; if ($tag !~ /^($edgar_tags)$/) { $mtag = quotemeta $tag; $line =~ s/<$end$mtag/<$end$tag/; } } # special case handling -- problems seen in a few documents $line =~ s@@; $line =~ s@^\s*@; $line =~ s@^\s*@; $line =~ s@^\[FN\]@@; $line =~ s@@@g; $line =~ s@^\s*@g; $line =~ s@^\s*]+?>@@g; $line =~ s@^\s*/TABLE\s*$@@; $line =~ s@^\s*@; # special case for broken column indicators # on financial data schedules $line =~ s@^\s*\s+\s+$@
@; # special check for unclosed TABLE elements if ($line =~ m@^\s*
@) { $in_tbl = 1; } elsif ($line =~ m@^\s*
@) { $in_tbl = 0; } elsif ($line =~ m@^.+$@) { $in_tbl = 0; } # unclosed TABLE? if ($in_tbl && ($line =~ m@@)) { $line = "\n" . $line; $in_tbl = 0; } # missing SIC code? if ($line =~ m@^\s*$@) { # set to unknown SIC $line .= " "; } # FDS document? for hack below $has_fds = 1 if ($line =~ /^\s*/); } # hack for errors in element ordering...this is occurring with # enough frequency now that manual corrections are a pain. - bjb if ($has_fds) { my $subm = join("\n", @submission); # we are seeing an increase in the number of FDS documents with obvious # invalid SGML errors that are easy to correct $subm =~ s/.*?\n(\s*.*?)\n(\s*.*?)\n/\n$2\n$1\n/gs; # updated submission @submission = split(/\n/, $subm); } # need to force DTD info for initial conversion/validation unshift(@submission, $DOCTYPE); # write the (possibly) modified content open(OUT, ">$ofile") || die "$prog: error writing $ofile: $!\n"; print OUT join("\n", @submission), "\n"; close OUT; $logger::DEBUG->outputLogger(); $logger::DEBUG->reInit(); } exit 0; # # extract *special* documents from raw EDGAR filings, replacing the elements # with pointers to separately stored versions. # sub extractDocuments { my $array_ref = shift; # accession number for output filenames my($accno) = ''; # processing a DOCUMENT element? my($in_doc) = 0; # current document sequence number my($curr_doc) = 0; my($i, $line, $file_name, $new_file_name); my(%changed_files, @dublicates); # look for our *special* documents for ($i=0 ; $i <= $#{$array_ref}; $i++) { $line = $array_ref->[$i]; if (!$accno && $line =~ /\s*(\S+)\s*$/) { $accno = $1; print STDERR "** DEBUG: extract: accno: ($accno) **\n" if ($debug > 1); next; } if ($line =~ //) { $in_doc = 1; print STDERR "** DEBUG: extract: in_doc **\n" if ($debug > 1); } elsif ($in_doc) { # end of current document? if ($line =~ /<\/DOCUMENT>/) { $in_doc = 0; print STDERR "** DEBUG: extract: end doc #${curr_doc} **\n" if ($debug > 1); next; } # get document sequence number if ($line =~ /\s*(\d+)/) { $curr_doc = $1; print STDERR "** DEBUG: extract: curr_doc ($curr_doc) **\n" if ($debug > 1); next; } # get the file name if ($line =~ /^(.*?)\s*$/io) { $file_name = $1; print STDERR "** DEBUG: extract: file_name ($file_name) **\n" if ($debug > 1); if ($file_name =~ /\.htm|\.pdf|\.gif|\.jpg|\.txt/) { # extract these print STDERR "** DEBUG: extract: (call to doExtraction) **\n" if ($debug > 1); ($i, $new_file_name) = &doExtraction($i, $file_name, $accno, $curr_doc, $array_ref); $changed_files{$file_name} = $new_file_name; } } } } &fixHtmlLinks($accno, \%changed_files); 0; } # # files of extension .html .pdf .gif .jpg are "cut" out of the file # and stored to the local disk # files of extension .txt are NOT removed from the file # and are stored to the local disk # sub doExtraction { my $index = shift; my $file_name = shift; my $accno = shift; my $squence_number = shift; my $array_ref = shift; my ($tag, $start_mark, $end_mark, $extension, $data, $encoded_file, $mode); my $do_uudecode = 0; # uudecode the data my $do_extract = 1; # remove the data from the array_ref $file_name =~ /.+\.(.+)/; $extension = $1; print STDERR "** DEBUG: doExtraction: extension: ($extension) **\n" if ($debug > 1); $logger::DEBUG->current('-name' => "extract $extension", '-append' => 1); if ($extension =~ /htm/i) { $extension = "html"; # extention htm will not work $tag = "HTML"; $start_mark = ""; $end_mark = ""; } elsif ($extension =~ /pdf/i) { $tag = "PDF"; $start_mark = ""; $end_mark = ""; $do_uudecode = 1; } elsif ($extension =~ /gif/i) { $tag = "GIF"; $start_mark = "^begin .*?\.gif"; $end_mark = "end"; $do_uudecode = 1; } elsif ($extension =~ /jpg/i) { $tag = "JPG"; $start_mark = "^begin .*?\.jpg"; $end_mark = "end"; $do_uudecode = 1; } elsif ($extension =~ /txt/i) { $tag = "TEXT"; $start_mark = ""; $end_mark = ""; $do_extract = 0; } # find start tag while (++$index < $#{$array_ref}) { last if ($array_ref->[$index] =~ /$start_mark/i); } my $end = &findSectionEnd($index, $end_mark, $array_ref); print STDERR "** DEBUG: doExtraction: start index: ($index) end index: ($end) **\n" if ($debug > 1); my $length = ($end - $index) + 1; my $path = EDGAR::util::accno2path($accno); my $uri = $HTMLuri; my $file = "$accno.$squence_number.$extension"; $uri .= "/" . $path . "/" . "$file"; # extract the data my @extract; if ($do_extract) { @extract = splice(@{$array_ref}, $index, $length); $index = &removeExtraInfo($index, $array_ref); splice(@{$array_ref}, $index, 0, "<$tag>\n$uri\n"); } else { @extract = @{$array_ref}[$index..$end]; } if ($do_uudecode) { print STDERR "** DEBUG: doExtraction: (calling uudecode) **\n" if ($debug > 1); ($data, $encoded_file, $mode) = uudecode(\@extract); } else { $data = join("\n", @extract) . "\n"; } &save_extract($file, $data); $logger::DEBUG->increment('-name' => "extracted_".$extension); $logger::DEBUG->saveCurrentInfo('-type' => "extracted file name", '-info' => $file); return ($index, $file); } sub findSectionEnd { my $index = shift; my $end_tag = shift; my $array_ref = shift; # find end while (++$index < $#{$array_ref}) { last if ($array_ref->[$index] =~ /^\s*$end_tag/); if ($array_ref->[$index] =~ m||) { # should not get here, missing the end tag print STDERR "** DEBUG: findSectionEnd: (adding a missing end tag) **\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "adding a missing end tag [$end_tag]"); splice(@{$array_ref}, $index, 0, $end_tag); last; } } return $index; } sub removeExtraInfo { my $index = shift; my $array_ref = shift; # go back to tag while ($index-- > 0) { last if ($array_ref->[$index] =~ //); } my $end = &findSectionEnd($index, "", $array_ref); my $start = $index+1; my @extra_data = splice(@{$array_ref}, $start, $end-$start); my $extra_data = join('', @extra_data); $extra_data =~ s/\r//g; # so we can read the extra data if ($extra_data =~ /\S/) { print STDERR "** ERROR: removeExtraInfo: has extra data ($extra_data) **\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "contains extra data [$extra_data]"); } return $start; } sub fixHtmlLinks { my $accno = shift; my $hash_ref = shift; my $path = EDGAR::util::accno2path($accno); %file_hash::FIX_NC; # global $file_handle::FIX_NC; # global foreach (keys %{$hash_ref}) { $file_hash::FIX_NC{$_} = $hash_ref->{$_}; } my ($new_file, $org_file, $error); my $p = HTML::Parser->new(api_version => 3, start_h => [\&startTagHandler, "tagname,text"], default_h => [\&defaultTagHandler, "text"]); foreach $new_file (keys %{$hash_ref}) { $org_file = $hash_ref->{$new_file}; if ($new_file =~ /\.html|\.htm/) { `cp $org_file $org_file.bak`; $file_handle::FIX_NC = gensym(); open ($file_handle::FIX_NC, ">$org_file") || die "can not create file $org_file: $!"; if (!$p->parse_file("$org_file.bak")) { print STDERR "** ERROR: fixHtmlLinks: parse_file: ($!) **\n"; next; } close $file_handle::FIX_NC; `rm -f $org_file.bak` if (!$KEEP); } } } sub startTagHandler { my $tag_name = shift; my $text = shift; my ($img_src); if ($tag_name eq "img") { $text =~ /SRC\s*=\s*"?([^" >]+)/i; $img_src = $1; if ($file_hash::FIX_NC{$img_src}) { $text =~ s/$img_src/$file_hash::FIX_NC{$img_src}/; } } my ($html_src); if ($tag_name eq "a") { if ($text =~ /HREF\s*=\s*"?([^" >]+)/i) { $html_src = $1; if ($file_hash::FIX_NC{$html_src}) { $text =~ s/$html_src/$file_hash::FIX_NC{$html_src}/; } } } print $file_handle::FIX_NC "$text"; } sub defaultTagHandler { my $text = shift; print $file_handle::FIX_NC "$text"; } # # save contents to specified file # sub save_extract { my($file) = shift; my($contents) = shift; print STDERR "** DEBUG: save: file: ($file) **\n" if ($debug > 1); open(OUT, ">$file") || return -1; print OUT $contents; close OUT; 0; } sub version { my $ver = $VERSION . "r" . $RELEASE; if (__PACKAGE__ !~ /^main$/) { $ver = __PACKAGE__ . " " . $ver; } return $ver; }