#!/iw/perl/current/bin/perl # # file: fix-xml # desc: post-fix XML conversion results # eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}' if 0; # limit scope...keep this first package EDGAR::fixxml; use strict; use vars qw($VERSION $RELEASE); # # current module version my $Id =<<'EoI'; # $Id: //depot/isms/skulker/edgar/1.2.4/bin/fix-xml#5 $ EoI # my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/); my $VERSION = "1.2.4"; BEGIN { my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $edgar_root = "$iw_root/skulker/edgar/current"; # where we find our local libraries my($libdir) = "$edgar_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # EDGAR XML support require 'edgar-util.pl'; } # we want perl 5.00x or later require 5.004; # who am i? my $prog; ($prog = $0) =~ s#.*/##; # for processing command line options use Getopt::Std; # process command line options, if any my %opt; getopts('D:vh', \%opt) || &usage($prog); # debug mode? my $debug = defined($opt{'D'}) ? $opt{'D'} : 0; # print the version if ($opt{'v'}) { print &version(), "\n"; exit; } # print the usage &usage($prog) if $opt{'h'}; sub usage { my $prog = shift; print "usage: $prog [-D debug_level] [-v] [-h] input_file\n"; print " -D : debug mode, 0-9\n"; print " -v : version, print out the version number and exit\n"; print " -h : help, print out this message and exit\n"; print "\n"; exit; } # Debugging package, currently in edgar/lib, should be in common/lib use Logger; # set up stuff for run log and Debugging my $do_logging = 0; $do_logging = 1 if ($debug > 8); $logger::DEBUG = Logger->new(); $logger::DEBUG->setupLogger('-base-dir' => "/tmp", '-base-filename' => $prog, '-do-logging' => $do_logging); $logger::DEBUG->logDateTime(); foreach my $file (@ARGV) { my(@submission) = (); my $array_ref; print STDERR "Processing $file ...\n" if ($debug); open(IN, "$file") || die "$prog: error reading $file: $!\n"; # slurp in the whole file while () { chomp; # remove XML declaration tag next if (/^<\?xml version=/); # don't mess with TABLE column indicators for now unless (m@<(C|S)/?>@) { s@><@>\n<@g; # clean up tags at start/end of lines s@^<(/?)([^>]+)>(.+)$@<$1$2>\n$3@g; s@^(.+)<(/?)([^>]+)>$@$1\n<$2$3>@g; } # special case for TABLE elements s@(\s*)<(/CAPTION)><(C|S)(/?)>@<$2>\n$1<$3$4>@g; s@(CAPTION|TABLE)><(C|S)(/?)>@$1>\n<$2$3>@g; s@^<(CAPTION|TABLE)>(.+)$@<$1>\n$2@g; s@^(.+)$@\n$1@g; s@()@\n$1@; s@><(C|S)@>\n<$1@g; # don't lose blank lines on split below s@^$@ @gm; push(@submission, split(/\n/)) ; } close(IN); $logger::DEBUG->logFileInfo('-file-name' => $file); # output file to use my $ofile = $file; $ofile .= ".fixed" if ($debug); # additional clean up $array_ref = &addTagsToDocument(\@submission); # write the (possibly) modified content open(OUT, ">$ofile") || die "$prog: error writing $ofile: $!\n"; print OUT join("\n", @{$array_ref}), "\n"; close OUT; $logger::DEBUG->outputLogger(); $logger::DEBUG->reInit(); } exit 0; sub addTagsToDocument { my $array_ref = shift; my @array; my $ref; # split the file into documents ... my $document_ref = &splitDocuments($array_ref); for my $i (0 .. $#{$document_ref}) { &tagDocument($document_ref->[$i]); # back into a single array push(@array, @{$document_ref->[$i]}); } return (\@array); } sub splitDocuments { my $array_ref = shift; my @array_of_arrays; my $array_cnt = 0; my $index = -1; while (++$index < $#{$array_ref}) { if ($array_ref->[$index] =~ m@@) { @{$array_of_arrays[$array_cnt]} = splice(@{$array_ref}, 0, $index+1); $array_cnt++; $index = -1; $logger::DEBUG->current('-name' => "document" . $array_cnt); } } # add whatever is left over, should be only @{$array_of_arrays[$array_cnt]} = @{$array_ref}; return \@array_of_arrays; } # # tag document PARTs and ITEMs # sub tagDocument { my $array_ref = shift; my $sequence_number = &getSequenceNumber($array_ref); my %tags = &makeTagsForThisDocument($array_ref, $sequence_number); &insertTags($array_ref, \%tags); } sub getSequenceNumber { my $array_ref = shift; my $index = -1; while (++$index < $#{$array_ref}) { if ($array_ref->[$index] =~ m@\s*(\d+)$@) { return ($1); } } # do not panic! # no sequence number should mean that are at the end of the # document, outside of the .. block $logger::DEBUG->saveErrorsCurrent('-error' => "no sequence number"); return (1); } # # runs through a document # finds elements to tag # and adds them to the tag_hash # sub makeTagsForThisDocument { my $array_ref = shift; my $curr_doc = shift; # the current table number my $curr_tbl = 1; my ($line, $tag_hash, $section, $section_part); my ($description, $tag, $tag_id, $src); my $index = -1; while (++$index < $#{$array_ref}) { $line = $array_ref->[$index]; if (($line =~ /^\s*TABLE\s+OF\s+CONTENTS/i) || ($line =~ /^\s*CONTENTS\s*$/i) || ($line =~ /INDEX/)) { ($index) = &readAndSkipToC($index, $array_ref); next; } if ($line =~ //) { ($index, $curr_tbl, $tag_hash) = &tagTable($index, $array_ref, $curr_doc, $curr_tbl, $tag_hash); $logger::DEBUG->increment('-name' => "Document Table"); next; } ########################## ## look for a PART line ## ########################## # PART with description if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)[\.:]?\s+(.*)$@i) { $section = $1; $description = $2; $description = &cleanUpDescription($description); $tag_id = "part_" . $section; if ($tag_hash->{$tag_id}) { if (&duplicateKeepOld($array_ref, $index, $tag_hash->{$tag_id}{'LOCATION'})) { next; } } $tag = &makeTag('-type' => "part", '-doc_number' => $curr_doc, '-sequence' => $section, '-description' => $description); $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; # keep the part sections for item $section_part = $section; $logger::DEBUG->increment('-name' => "Parts with description"); next; } # PART with no description if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)\s*$@i) { $section = $1; $tag_id = "part_" . $section; if ($tag_hash->{$tag_id}) { if (&duplicateKeepOld($array_ref, $index, $tag_hash->{$tag_id}{'LOCATION'})) { next; } } $tag = &makeTag('-type' => "part", '-doc_number' => $curr_doc, '-sequence' => $section); $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; # keep the part sections for item $section_part = $section; $logger::DEBUG->increment('-name' => "Parts with NO description"); next; } ########################## ## look for a ITEM line ## ########################## if ($line =~ /^\s*ITEM\s+(\d+)[\.:]?\s+(.*)$/i) { $section = $1; $description = $2; $description = &cleanUpDescription($description); $tag_id = "item_" . $section; $tag_id .= "_part_" . $section_part if ($section_part); if ($tag_hash->{$tag_id}) { if (&duplicateKeepOld($array_ref, $index, $tag_hash->{$tag_id}{'LOCATION'})) { next; } } $tag = &makeTag('-type' => "item", '-doc_number' => $curr_doc, '-sequence' => $section, '-description' => $description, '-inside_part' => $section_part); $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; $logger::DEBUG->increment('-name' => "Item"); next; } ########################## ## look for a NOTE line ## ########################## if ($line =~ /^\s*NOTE\s+([A-Z]|\d+|[IVX]+)[\.:-]?\s+(.*)$/) { $section = $1; $description = $2; $description = &cleanUpDescription($description); $tag_id = "note_" . $section; if ($tag_hash->{$tag_id}) { if (&duplicateKeepOld($array_ref, $index, $tag_hash->{$tag_id}{'LOCATION'})) { next; } } $tag = &makeTag('-type' => "note", '-doc_number' => $curr_doc, '-sequence' => $section, '-description' => $description); $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; $logger::DEBUG->increment('-name' => "Note"); next; } ########################### ## look for an HTML line ## ########################### if ($line =~ //) { if ($line =~ m@([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 1, $src); } elsif ($array_ref->[$index+1] =~ m@([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 2, $src); } else { print STDERR "ERROR: did no get html src from [$line]\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "ERROR: did no get html src from [$line]"); } $logger::DEBUG->increment('-name' => "HTML"); } ########################## ## look for an PDF line ## ########################## if ($line =~ //) { if ($line =~ m@([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 1, $src); } elsif ($array_ref->[$index+1] =~ m@([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 2, $src); } else { print STDERR "ERROR: did no get pdf src from [$line]\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "ERROR: did no get pdf src from [$line]"); } $logger::DEBUG->increment('-name' => "PDF"); } ################################ ## look for a GIF or JPG line ## ################################ if ($line =~ /<(GIF|JPG)>/) { my $tag = $1; if ($line =~ m@<$tag>([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 1, $src); } elsif ($array_ref->[$index+1] =~ m@([^<]+)@) { $src = $1; $src = ""; splice(@{$array_ref}, $index, 2, $src); } else { print STDERR "ERROR: did no get img src from [$line]\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "ERROR: did no get img src from [$line]"); } $logger::DEBUG->increment('-name' => $tag); } } if (defined %{$tag_hash} ) { return %{$tag_hash}; } else { return (); } } # # makes tags # below are the main type of tags # note that table has an "end tag" and a document(?) # and that item tag is "inside" a part # # # # # # sub makeTag { my %params = @_; my $type = $params{'-type'}; my $doc_number = $params{'-doc_number'}; my $sequence = $params{'-sequence'}; my $description = $params{'-description'}; my $end_tag = $params{'-end_tag'}; my $inside_part = $params{'-inside_part'}; my $id = "id=\"document_" . $doc_number; if ($inside_part) { $id .= "_part_" . $inside_part; } $id .= "_" . $type . "_" . $sequence . "\" "; $sequence = &EDGAR::util::encode_av($sequence); $description = &EDGAR::util::encode_av($description); my $tag = ""; } else { $tag .= ">"; } return $tag; } # # only tag the start of a table. No tags are placed inside of a table, # except for the nested table. # skip to the end of the table # sub tagTable { my $index = shift; my $array_ref = shift; my $curr_doc = shift; my $curr_tbl = shift; my $tag_hash = shift; my $tag = &makeTag('-type' => 'table', '-doc_number' => $curr_doc, '-sequence' => $curr_tbl, '-end_tag' => 1); my $tag_id = "table_" . $curr_tbl; $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; # leave at
tag my $did_col = 0; while ($array_ref->[++$index] !~ m@@) { next if ($array_ref->[$index] =~ /^\s+$/); # column indicator line if ($array_ref->[$index] =~ m@^\s*[ ]+$@) { if ($did_col) { # nested pseudo-table not allowed but we'll # fake it for now $curr_tbl += 0.1; $tag = &makeTag('-type' => "table", '-doc_number' => $curr_doc, '-sequence' => $curr_tbl, '-end_tag' => 1); $tag_id = "table_" . $curr_tbl; $tag_hash->{$tag_id}{'LOCATION'} = $index; $tag_hash->{$tag_id}{'TAG'} = $tag; $logger::DEBUG->increment('-name' => "Nest Document Table"); } $did_col = 1; } } $curr_tbl = int(++$curr_tbl); # increment and int the counter return ($index, $curr_tbl, $tag_hash); } # # clean up the descriptions (Item descr., Note descr., Part descr.) # sub cleanUpDescription { my $description = shift; $description =~ s/\d+\s*$//; # ends with a page number $description =~ s/\.\.+//; # has dots in it $description =~ s/^-//; # starts with a hypen $description =~ s/^\s+|\s+$//g; # leading or trailing spaces return $description; } # # picks the (hopefully) correct tagged element # returns 1 to keep the old tagged element (closer to the top of the doc) # returns 0 to change to a new tagged element (closer to the end of the doc) # sub duplicateKeepOld { my $array_ref = shift; my $new_location = shift; my $old_location = shift; # the one with the most blank lines wins my $old_blank_lines = 0; my $new_blank_lines = 0; # test the lines above and below for empty lines $old_blank_lines++ if ($array_ref->[$old_location-1] =~ /^\s+$/); $old_blank_lines++ if ($array_ref->[$old_location+1] =~ /^\s+$/); $new_blank_lines++ if ($array_ref->[$new_location-1] =~ /^\s+$/); $new_blank_lines++ if ($array_ref->[$new_location+1] =~ /^\s+$/); if ($old_blank_lines > $new_blank_lines) { return 1; } elsif ($new_blank_lines > $old_blank_lines) { return 0; } # test to see if the new one is a CONTINUE return 1 if ($array_ref->[$new_location] =~ /CONTINUE/i); return 1 if ($array_ref->[$new_location] =~ /Contd/); # test the second line as well return 1 if ($array_ref->[$new_location+1] =~ /CONTINUE/i); return 1 if ($array_ref->[$new_location+1] =~ /Contd/); # most leading spaces win my $new_leading_spaces = 0; if ($array_ref->[$new_location] =~ /^(\s+)\S/) { $new_leading_spaces = length($1); } my $old_leading_spaces = 0; if ($array_ref->[$old_location] =~ /^(\s+)\S/) { $old_leading_spaces = length($1); } if ($old_leading_spaces > $new_leading_spaces) { return 1; } elsif ($new_leading_spaces > $old_leading_spaces) { return 0; } # the shortest one wins my $no_spaces = $array_ref->[$old_location]; $no_spaces =~ s/^\s+|\s+$//; my $old_length = length($no_spaces); $no_spaces = $array_ref->[$new_location]; $no_spaces =~ s/^\s+|\s+$//; my $new_length = length($no_spaces); if ($old_length < $new_length) { return 1; } elsif ($new_length < $old_length) { return 0; } # the one with the most capitial letters win my $old_num_cap = ($array_ref->[$old_location] =~ tr/[A-Z]/[A-Z]/); my $new_num_cap = ($array_ref->[$new_location] =~ tr/[A-Z]/[A-Z]/); if ($old_num_cap > $new_num_cap) { return 1; } elsif ($new_num_cap > $old_num_cap) { return 0; } if ($debug > 5) { print STDERR "duplicateKeepOld: Its a Tie! use the old one\n"; print STDERR "duplicateKeepOld old --> [$array_ref->[$old_location]]\n"; print STDERR "duplicateKeepOld new --> [$array_ref->[$new_location]]\n"; } $logger::DEBUG->saveErrorsCurrent('-error' => "duplicateKeepOld: Its a Tie! use the old one\n". "duplicateKeepOld old --> [$array_ref->[$old_location]]\n". "duplicateKeepOld new --> [$array_ref->[$new_location]]\n"); return 1; } # # splice tags from the tag_hash into the document_array # add end tags to tags that needed the (do not end with />) # sub insertTags { my $array_ref = shift; my $hash_ref = shift; my ($tag_id, $location, $tag, $index, $end_tag); # go from the bottom to the top foreach $tag_id (sort { $hash_ref->{$b}{'LOCATION'} <=> $hash_ref->{$a}{'LOCATION'} } keys %{$hash_ref}) { $location = $hash_ref->{$tag_id}{'LOCATION'}; $tag = $hash_ref->{$tag_id}{'TAG'}; if ($hash_ref->{$tag_id}{'TAG'} =~ m@/>$@) { splice(@{$array_ref}, $location, 0, $tag); } else { # make end tag $tag_id =~ /^([^_]+)_/; $end_tag = $1; $end_tag = ""; splice(@{$array_ref}, $location+1, 0, $end_tag); splice(@{$array_ref}, $location, 0, $tag); } } } # # prints out what is stored in nested hash str # sub dumpHashDebug { my $hash_ref = shift; my $indent = shift; my $key; $indent = " " if !$indent; foreach $key (keys %{$hash_ref}) { print STDERR $indent, "$key <=> $hash_ref->{$key}\n"; if ($hash_ref->{$key} =~ /HASH/) { &dumpHashDebug($hash_ref->{$key}, $indent . $indent); } } print STDERR "\n"; } # # some Table of Contents have line like # Item 1. This is Item One ....... 1 # Item 2. This is Item Two ....... 28 # the above lines at NOT the ITEM's we want to tag! # the ITEM's we want to tag are # Page 1. # Item 1. This is Item One # # then some dribble about what Item one is and how great and import # this Item is... # # We try and skip the ToC and tag the real ITEM's # But while we are in the ToC grap some info, 'cause we can # sub readAndSkipToC { my $index = shift; my $array_ref = shift; my $org_index = $index; my (%toc_hash, $cnt, $line, $did_something, $part, $description); my ($sequence, $last_action, $did_item, $check_item_cont); while (++$index < $#{$array_ref}) { $line = $array_ref->[$index]; if (!$line || ($line =~ /^\s+$/)) { $did_item = 0; next; } $did_something = 1; # PART with description if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)[\.:]?\s+(.*)$@i) { $part = $1; $description = $2; last if ($toc_hash{'PART'}{$part}); $description = &cleanUpDescription($description); $toc_hash{'PART'}{$part} = $description; # PART with no description } elsif ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)\s*$@i) { $part = $1; last if ($toc_hash{'PART'}{$part}); $toc_hash{'PART'}{$part} = "NO_DESCRIPTION"; # ITEM } elsif ($line =~ /^\s*ITEM\s+(\d+[A-Z]?)[\.:]?\s?(.*)$/i) { $sequence = $1; $description = $2; last if ($toc_hash{'ITEM'}{$part}{$sequence}); $description = &cleanUpDescription($description); $part = 1 if (!$part); $toc_hash{'ITEM'}{$part}{$sequence} = $description; $did_item = 1; # see if this is a continued item } elsif ($check_item_cont && ($line =~ /^\s+(.*)$/)) { $description = $1; $description = &cleanUpDescription($description); $toc_hash{'ITEM'}{$part}{$sequence} .= " ".$description; $did_item = 1; # mulit line cont? # keep track if we did not do something } else { $did_something = 0; $cnt++; } $check_item_cont = $did_item ? 1 : 0; if ($did_something) { # go back to here when we leave $last_action = $index; $cnt = 0; } elsif ($cnt > 10) { # quit last; } } $last_action++; if ($debug > 5) { print STDERR "** readAndSkipToC: Dump of the ToC hash\n"; &dumpHashDebug(\%toc_hash); print STDERR "** readAndSkipToC: End of the ToC dump\n\n"; } if (!%toc_hash) { return ($org_index); } # need a way to pick the better ToC hash or merge the two hashes # together. Example would be a break inside of a ToC # NOTE at this time nothing is done with the %toc_hash $logger::DEBUG->increment('-name' => "Table Of Contents"); return ($last_action); } sub version { my $ver = $VERSION . "r" . $RELEASE; if (__PACKAGE__ !~ /^main$/) { $ver = __PACKAGE__ . " " . $ver; } return $ver; }