#!/iw/perl/current/bin/perl -w # # file: edgar-correction # desc: read a *.pc file and preform the correction to # the stored *.xml and/or *.html file # eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}' if 0; # limit scope...keep this first package EDGAR::correction; use strict; use vars qw($VERSION $RELEASE); # # current module version my $Id =<<'EoI'; # $Id: //depot/isms/skulker/edgar/1.2.4/bin/edgar-correction#1 $ EoI # my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/); my $VERSION = "1.2.2"; BEGIN { my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $edgar_root = "$iw_root/skulker/edgar/current"; # where we find our local libraries my($libdir) = "$edgar_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # general EDGAR utility support routines require 'edgar-util.pl'; # general EDGAR library support routines require 'edgar-lib.pl'; } # we want perl 5.00x or later require 5.004; # who am i? my $prog; ($prog = $0) =~ s#.*/##; # pass file handles use Symbol; # for processing command line options use Getopt::Std; # process command line options, if any my %opt; getopts('D:b:m:l:L:vh', \%opt) || &usage($prog); # where to find submissions my $basedir = defined($opt{'b'}) ? "$opt{'b'}" : $EDGAR::lib::edgar_data; # debug mode? my $debug = defined($opt{'D'}) ? $opt{'D'} : 0; # move file my $move = defined($opt{'m'}) ? $opt{'m'} : 0; if ($move) { if (! -d "$move") { print STDERR "Error: -m [dir] The dir must exists\n\n"; &usage($prog); } if (! -w "$move") { print STDERR "Error: -m [dir] Must be able to write to the dir\n\n"; &usage($prog); } } # log file my $log = defined($opt{'l'}) ? $opt{'l'} : 0; my $log_fh = gensym(); if ($log) { $log .= "/$prog-log" if ( -d "$log"); open ($log_fh, ">>$log") || die "can not appened to log file $log: $!"; } elsif ($opt{'L'}) { if (! -d "$opt{'L'}") { print STDERR "Error: -L [dir] The dir must exists\n\n"; &usage($prog); } my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(); $year = sprintf("%02d", $year-100); $mon = sprintf("%02d", ++$mon); $mday = sprintf("%02d", $mday); $log = $opt{'L'}."/$prog-$year$mon$mday"; open ($log_fh, ">>$log") || die "can not appened to log file $log: $!"; } # print the version if ($opt{'v'}) { print &version(), "\n"; exit; } # print the usage &usage($prog) if $opt{'h'}; sub usage { my $prog = shift; print "usage: $prog [-D debug_level] [-b basedir] [-k] [-m dir] [-l file] [-L dir] [-v] [-h] correction_file\n"; print " -D : debug mode, 0-9\n"; print " -b : base directory\n"; print " -m : move original file, If a delete, move files to this dir instead of\n"; print " deleting them. If an update, move the original file here.\n"; print " Overrides the -k option\n"; print " -l : log file. Where logging information is appened.\n"; print " If a given a dir logs to /dir/$prog-log\n"; print " -L : Log file /w date. Where logging information is appened.\n"; print " Given a dir logs to /dir/YYMMDD\n"; print " -v : version, print out the version number and exit\n"; print " -h : help, print out this message and exit\n"; print "\n"; print " reads a correction file (*.pc) makes changes to the\n"; print " corresponding xml if needed, delets both the xml and html files\n"; exit; } my ($file, %new_tags, %org_tags, %tag_hash, $key); my ($file_handle, $accno, $path, $deletion, $tag); FILE: foreach $file (@ARGV) { print STDERR "Processing $file ...\n" if ($debug); if ($log) { print $log_fh "\n"; print $log_fh "Processing $file ...\n"; } # submission accession number $accno = ''; # are we doing a deletion? $deletion = 0; $file_handle = gensym(); open($file_handle, "$file") || die "$prog: error reading $file: $!\n"; # add a File Number/Act noted by the my $add_cik = 0; # delete a File Number/Act noted by the my $del_cik = 0; # add a registrant noted by my $add_file = 0; # delete a registrant noted by my $del_file = 0; # follows my $delete_whole_submission = 0; # the data structor that holds all the info %tag_hash = (); # print a warning message. I don't know what to do with these my $fatal_error = 0; # pass one find out what we are to do, and get the accno while (<$file_handle>) { if (//) { /^.*(.*?)\s*$/; $accno = $1; # make sure we have a valid accession number unless ($accno =~ /^\d+-\d\d-\d+$/) { print STDERR "Invalid accession # ($accno) " . "... skipping $file\n"; next FILE; } # check next line for $_ = <$file_handle>; if (//) { print STDERR "debug: DELETE WHOLE SUBMISSION - " . " after \n" if ($debug > 1); $delete_whole_submission = 1; last; } next; } if (//) { # NOTE this type is not known! # prints a warning message and exists # check next line for $_ = <$file_handle>; if (//) { print STDERR "debug: after \n" if ($debug > 1); ($tag_hash{$add_file}{'ADD_FILE'}{'VALUES'}, %{$tag_hash{$add_file}{'ADD_FILE'}{'KEY'}}) = &readReturnTransferValues($file_handle, "", ""); $add_file++; $fatal_error = 1; } if (//) { print STDERR "debug: after \n" if ($debug > 1); ($tag_hash{$del_file}{'DEL_FILE'}{'VALUES'}, %{$tag_hash{$del_file}{'DEL_FILE'}{'KEY'}}) = &readReturnTransferValues($file_handle, "", ""); $del_file++; $fatal_error = 1; } if (//) { print STDERR "debug: after \n" if ($debug > 1); ($tag_hash{$del_file}{'DEL_FILE'}{'VALUES'}, %{$tag_hash{$del_file}{'DEL_FILE'}{'KEY'}}) = &readReturnTransferValues($file_handle, "", ""); $del_file++; $fatal_error = 1; } next; } if (// || // || // || // || //) { $tag = $_; $tag =~ s/\s+$//; # remove \r\n (dos) $tag =~ s|^<|"; # check next line for # or $_ = <$file_handle>; if (//) { print STDERR "debug: after , etc\n" if ($debug > 1); ($tag_hash{$add_cik}{'ADD_CIK'}{'VALUES'}, %{$tag_hash{$add_cik}{'ADD_CIK'}{'KEY'}}) = &readReturnTransferValues($file_handle, $tag, $key); $add_cik++; } if (//) { print STDERR "debug: after , etc\n" if ($debug > 1); ($tag_hash{$del_cik}{'DEL_CIK'}{'VALUES'}, %{$tag_hash{$del_cik}{'DEL_CIK'}{'KEY'}}) = &readReturnTransferValues($file_handle, $tag, $key); $del_cik++; } if (//) { print STDERR "debug: after , etc\n" if ($debug > 1); ($tag_hash{$del_cik}{'DEL_CIK'}{'VALUES'}, %{$tag_hash{$del_cik}{'DEL_CIK'}{'KEY'}}) = &readReturnTransferValues($file_handle, $tag, $key); $del_cik++; } next; } } close $file_handle; if ($fatal_error) { print STDERR "\nERROR! I do not know how to handle "; print STDERR "this file [$file]\nGoing to next file... \n"; print $log_fh "ERROR! I do not know how to handle this file\n" if ($log); next; } if ($delete_whole_submission) { print $log_fh "delete whole submission\n" if ($log); &deleteWholeSubmission($accno, $basedir); next; } if (!$add_cik && !$del_cik && !$add_file && !$del_file) { # substitute specific tags print $log_fh "substitute specific tags\n" if ($log); &substituteTags($file, $accno, $basedir); next; } else { # delete and/or add tags print $log_fh "delete and/or add tags\n" if ($log); &deleteAddTags($accno, $basedir, \%tag_hash); next; } } sub addCIKtags { my $sub_aref = shift; my $sub_index = shift; my $add_aref = shift; my ($sub_tag, $add_tag); $add_aref->[$#{$add_aref}] =~ /(<[^>]+>)/; my $last_tag = $1; my $i = $sub_index; my $add_index = 0; while ($i <= $#{$sub_aref}) { $sub_aref->[$i] =~ /(<[^>]+>)/; $sub_tag = $1; if ($add_aref->[$add_index] =~ /(<[^>]+>)/) { $add_tag = $1; } else { $add_tag = ""; } if ($sub_tag eq $add_tag) { $sub_aref->[$i] = $add_aref->[$add_index]; $add_index++; $i++; } else { splice(@{$sub_aref}, $sub_index+$add_index, 0, $add_aref->[$add_index]); $add_index++; } last if ($sub_tag eq $last_tag); last if ($add_index > $#{$add_aref}); } return $i; } sub deleteCIKtags { my $sub_aref = shift; my $sub_index = shift; my $del_aref = shift; my ($sub_tag, $next_sub_tag, $del_tag, $next_del_tag); $del_aref->[$#{$del_aref}] =~ /(<[^>]+>)/; my $last_tag = $1; my $i = $sub_index; my $match = 0; my $del_index = 0; while ($i <= $#{$sub_aref}) { $sub_aref->[$i] =~ /(<[^>]+>)/; $sub_tag = $1; $del_aref->[$del_index] =~ /(<[^>]+>)/; $del_tag = $1; if ($sub_tag eq $del_tag) { splice(@{$sub_aref}, $i, 1); $del_index++; } else { splice(@{$sub_aref}, $i, 1); } last if ($sub_tag eq $last_tag); } return $i; } sub checkValidXML { my $array_ref = shift; my ($i, $j, $end_tag, $found_end_tag); for ($i=0; $i<=$#{$array_ref}; $i++) { # no end tags on these, okay next if (($array_ref->[$i] =~ //) || ($array_ref->[$i] =~ //)); # has an end tag, or is an end tag, okay # NOTE this will not correct an end tag without a start tag :( next if ($array_ref->[$i] =~ m|[$i] =~ /(^<[^>]+>)/) { $end_tag = $1; $end_tag =~ s|^<|[$j] =~ /$end_tag/) { $found_end_tag = 1; last; } } splice(@{$array_ref}, $i+1, 0, $end_tag) if (!$found_end_tag); } } } sub readReturnTransferValues { my $file_handle = shift; my $end_tag = shift; my $key = shift; my (@array, $tag, $e_tag, $element, %hash); my $found_key = 0; while (<$file_handle>) { last if (/$end_tag/); next if (// || // || //); # skip these tags s/\s*$//; if (/(<[^>]+>)([^<]+)/) { $tag = $1; $element = $2; $e_tag = $tag; $e_tag =~ s|^<| 1); print $log_fh "readReturnTransferValues: Error did not find key [$key]\n" if ($log); } return \@array, %hash; } sub readReturnSubmission { my $file_handle = shift; my @array; my $done = 0; while (<$file_handle>) { if (//) { chomp; push (@array, $_); while (<$file_handle>) { chomp; push (@array, $_); if (//) { $done = 1; last; } } last if $done; } } return @array; } sub readReturnTags { my $file_handle = shift; my %hash; while (<$file_handle>) { last if (m||); if (// || // || // || // || // || // || // || //) { s/\s*$//; if (/(<[^>]+>)([^<]+)/) { $hash{$1} = $2; } } } return %hash; } sub deleteWholeSubmission { my $accno = shift; my $basedir = shift; my $path = &EDGAR::util::accno2path($accno); my $xml_file = $basedir . "/xml/" . $path . "/" . $accno . ".xml"; my $html_file = $basedir . "/html/" .$path . "/" . $accno . ".xml"; if ($move) { print STDERR "moving $xml_file and $html_file to $move\n" if ($debug > 1); print $log_fh "moving $xml_file and $html_file to $move\n" if ($log); `mv $xml_file $move`; `mv $html_file $move` if (-e "$html_file"); } else { print STDERR "removing $xml_file and $html_file\n" if ($debug > 1); print $log_fh "removing $xml_file and $html_file\n" if ($log); `rm $xml_file`; `rm $html_file` if (-e "$html_file"); } } sub deleteAddTags { my $accno = shift; my $basedir = shift; my $hash_ref = shift; my $path = &EDGAR::util::accno2path($accno); my $org_file = $basedir . "/xml/" . $path . "/" . $accno . ".xml"; my $file_handle = gensym(); if (!open ($file_handle, "$org_file")) { print STDERR "error reading $org_file: $!\n"; print $log_fh "error reading $org_file: $!\n"; return; } my @submission = &readReturnSubmission($file_handle); seek $file_handle, 0, 0; # back to the start of the file my $i = -1; my ($count, $tag, $element); while (++$i <= $#submission) { if ($submission[$i] =~ /(<[^>]+>)([^<]+)/) { $tag = $1; $element = $2; foreach $count ( sort { $hash_ref->{$a} cmp $hash_ref->{$b} } keys %{$hash_ref}) { if (($hash_ref->{$count}{'DEL_CIK'}{'KEY'}{$tag}) && ($hash_ref->{$count}{'DEL_CIK'}{'KEY'}{$tag} == $element)) { # back up untill first tag matches while ($hash_ref->{$count}{'DEL_CIK'}{'VALUES'}[0] ne $tag) { $i--; $submission[$i] =~ /(<[^>]+>)/; $tag = $1; } if ($hash_ref->{$count}{'DEL_CIK'}{'VALUES'}[0]) { $i = &deleteCIKtags(\@submission, $i, \@{$hash_ref->{$count}{'DEL_CIK'}{'VALUES'}}); } if ($hash_ref->{$count}{'ADD_CIK'}{'VALUES'}[0]) { $i = &addCIKtags(\@submission, $i, \@{$hash_ref->{$count}{'ADD_CIK'}{'VALUES'}}); } } } } } &checkValidXML(\@submission); my $corrected_file = $org_file . ".corrected"; open (CHANGED_FILE, ">$corrected_file") || die "error creating $corrected_file: $!"; while (<$file_handle>) { if (//) { while (<$file_handle>) { last if (//); } foreach (@submission) { print CHANGED_FILE "$_\n"; } last; } else { print CHANGED_FILE; } } while (<$file_handle>) { print CHANGED_FILE; } close $file_handle; close CHANGED_FILE; if ($move) { print STDERR "correcting $org_file, coping original to $move\n" if ($debug > 1); print $log_fh "correcting $org_file, coping original to $move\n" if ($log); `mv $org_file $move`; rename $corrected_file, $org_file; } else { print STDERR "correcting $org_file\n" if ($debug > 1); print $log_fh "correcting $org_file\n" if ($log); rename $corrected_file, $org_file; } } sub substituteTags { my $correction_file = shift; my $accno = shift; my $basedir = shift; print STDERR "substituteTags: substitute specific tags...\n" if ($debug > 1); my $file_handle = gensym(); open($file_handle, "$correction_file") || die "error reading $correction_file: $!"; my %new_tags = &readReturnTags($file_handle); close $file_handle; my $path = &EDGAR::util::accno2path($accno); my $org_file = $basedir . "/xml/" . $path . "/" . $accno . ".xml"; if (!open ($file_handle, "$org_file")) { print STDERR "error reading $org_file: $!\n"; print $log_fh "error reading $org_file: $!\n"; return; } my @submission = &readReturnSubmission($file_handle); seek $file_handle, 0, 0; # back to the start of the file my ($tag, $end_tag, $i); for ($i=0; $i<=$#submission; $i++) { if ($submission[$i] =~ /(<[^>]+>)/) { $tag = $1; if (defined $new_tags{$tag}) { $end_tag = $tag; $end_tag =~ s|^<|$corrected_file") || die "error creating $corrected_file: $!"; my $done = 0; while (<$file_handle>) { if (//) { while (<$file_handle>) { last if (//); } foreach (@submission) { print CHANGED_FILE "$_\n"; } last; } else { print CHANGED_FILE; } } while (<$file_handle>) { print CHANGED_FILE; } close $file_handle; close CHANGED_FILE; if ($move) { print STDERR "correcting $org_file, coping original to $move\n" if ($debug > 1); print $log_fh "correcting $org_file, coping original to $move\n" if ($log); `mv $org_file $move`; rename $corrected_file, $org_file; } else { print STDERR "correcting $org_file\n" if ($debug > 1); print $log_fh "correcting $org_file\n" if ($log); rename $corrected_file, $org_file; } } sub version { my $ver = $VERSION . "r" . $RELEASE; if (__PACKAGE__ !~ /^main$/) { $ver = __PACKAGE__ . " " . $ver; } return $ver; }