#!/iw/perl/current/bin/perl
#
# file: fix-xml
# desc: post-fix XML conversion results
#
eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}'
if 0;
# limit scope...keep this first
package EDGAR::fixxml;
use strict;
use vars qw($VERSION $RELEASE);
#
# current module version
my $Id =<<'EoI';
# $Id: //depot/isms/skulker/edgar/1.2.4/bin/fix-xml#5 $
EoI
#
my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/);
my $VERSION = "1.2.4";
BEGIN {
my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw";
my $edgar_root = "$iw_root/skulker/edgar/current";
# where we find our local libraries
my($libdir) = "$edgar_root/lib";
if ( -d $libdir) {
unshift(@INC, $libdir);
}
# EDGAR XML support
require 'edgar-util.pl';
}
# we want perl 5.00x or later
require 5.004;
# who am i?
my $prog;
($prog = $0) =~ s#.*/##;
# for processing command line options
use Getopt::Std;
# process command line options, if any
my %opt;
getopts('D:vh', \%opt) || &usage($prog);
# debug mode?
my $debug = defined($opt{'D'}) ? $opt{'D'} : 0;
# print the version
if ($opt{'v'}) {
print &version(), "\n";
exit;
}
# print the usage
&usage($prog) if $opt{'h'};
sub usage {
my $prog = shift;
print "usage: $prog [-D debug_level] [-v] [-h] input_file\n";
print " -D : debug mode, 0-9\n";
print " -v : version, print out the version number and exit\n";
print " -h : help, print out this message and exit\n";
print "\n";
exit;
}
# Debugging package, currently in edgar/lib, should be in common/lib
use Logger;
# set up stuff for run log and Debugging
my $do_logging = 0;
$do_logging = 1 if ($debug > 8);
$logger::DEBUG = Logger->new();
$logger::DEBUG->setupLogger('-base-dir' => "/tmp", '-base-filename' => $prog,
'-do-logging' => $do_logging);
$logger::DEBUG->logDateTime();
foreach my $file (@ARGV) {
my(@submission) = ();
my $array_ref;
print STDERR "Processing $file ...\n"
if ($debug);
open(IN, "$file") || die "$prog: error reading $file: $!\n";
# slurp in the whole file
while () {
chomp;
# remove XML declaration tag
next if (/^<\?xml version=/);
# don't mess with TABLE column indicators for now
unless (m@<(C|S)/?>@) {
s@><@>\n<@g;
# clean up tags at start/end of lines
s@^<(/?)([^>]+)>(.+)$@<$1$2>\n$3@g;
s@^(.+)<(/?)([^>]+)>$@$1\n<$2$3>@g;
}
# special case for TABLE elements
s@(\s*)<(/CAPTION)><(C|S)(/?)>@<$2>\n$1<$3$4>@g;
s@(CAPTION|TABLE)><(C|S)(/?)>@$1>\n<$2$3>@g;
s@^<(CAPTION|TABLE)>(.+)$@<$1>\n$2@g;
s@^(.+)(CAPTION|TABLE)>$@$2>\n$1@g;
s@()@\n$1@;
s@><(C|S)@>\n<$1@g;
# don't lose blank lines on split below
s@^$@ @gm;
push(@submission, split(/\n/)) ;
}
close(IN);
$logger::DEBUG->logFileInfo('-file-name' => $file);
# output file to use
my $ofile = $file;
$ofile .= ".fixed" if ($debug);
# additional clean up
$array_ref = &addTagsToDocument(\@submission);
# write the (possibly) modified content
open(OUT, ">$ofile") || die "$prog: error writing $ofile: $!\n";
print OUT join("\n", @{$array_ref}), "\n";
close OUT;
$logger::DEBUG->outputLogger();
$logger::DEBUG->reInit();
}
exit 0;
sub addTagsToDocument {
my $array_ref = shift;
my @array;
my $ref;
# split the file into documents ...
my $document_ref = &splitDocuments($array_ref);
for my $i (0 .. $#{$document_ref}) {
&tagDocument($document_ref->[$i]);
# back into a single array
push(@array, @{$document_ref->[$i]});
}
return (\@array);
}
sub splitDocuments {
my $array_ref = shift;
my @array_of_arrays;
my $array_cnt = 0;
my $index = -1;
while (++$index < $#{$array_ref}) {
if ($array_ref->[$index] =~ m@@) {
@{$array_of_arrays[$array_cnt]} =
splice(@{$array_ref}, 0, $index+1);
$array_cnt++;
$index = -1;
$logger::DEBUG->current('-name' =>
"document" . $array_cnt);
}
}
# add whatever is left over, should be only
@{$array_of_arrays[$array_cnt]} = @{$array_ref};
return \@array_of_arrays;
}
#
# tag document PARTs and ITEMs
#
sub tagDocument {
my $array_ref = shift;
my $sequence_number = &getSequenceNumber($array_ref);
my %tags = &makeTagsForThisDocument($array_ref, $sequence_number);
&insertTags($array_ref, \%tags);
}
sub getSequenceNumber {
my $array_ref = shift;
my $index = -1;
while (++$index < $#{$array_ref}) {
if ($array_ref->[$index] =~ m@\s*(\d+)$@) {
return ($1);
}
}
# do not panic!
# no sequence number should mean that are at the end of the
# document, outside of the .. block
$logger::DEBUG->saveErrorsCurrent('-error' => "no sequence number");
return (1);
}
#
# runs through a document
# finds elements to tag
# and adds them to the tag_hash
#
sub makeTagsForThisDocument {
my $array_ref = shift;
my $curr_doc = shift;
# the current table number
my $curr_tbl = 1;
my ($line, $tag_hash, $section, $section_part);
my ($description, $tag, $tag_id, $src);
my $index = -1;
while (++$index < $#{$array_ref}) {
$line = $array_ref->[$index];
if (($line =~ /^\s*TABLE\s+OF\s+CONTENTS/i) ||
($line =~ /^\s*CONTENTS\s*$/i) ||
($line =~ /INDEX/)) {
($index) = &readAndSkipToC($index, $array_ref);
next;
}
if ($line =~ //) {
($index, $curr_tbl, $tag_hash) = &tagTable($index,
$array_ref,
$curr_doc,
$curr_tbl,
$tag_hash);
$logger::DEBUG->increment('-name' => "Document Table");
next;
}
##########################
## look for a PART line ##
##########################
# PART with description
if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)[\.:]?\s+(.*)$@i) {
$section = $1;
$description = $2;
$description = &cleanUpDescription($description);
$tag_id = "part_" . $section;
if ($tag_hash->{$tag_id}) {
if (&duplicateKeepOld($array_ref, $index,
$tag_hash->{$tag_id}{'LOCATION'})) {
next;
}
}
$tag = &makeTag('-type' => "part",
'-doc_number' => $curr_doc,
'-sequence' => $section,
'-description' => $description);
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
# keep the part sections for item
$section_part = $section;
$logger::DEBUG->increment('-name' =>
"Parts with description");
next;
}
# PART with no description
if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)\s*$@i) {
$section = $1;
$tag_id = "part_" . $section;
if ($tag_hash->{$tag_id}) {
if (&duplicateKeepOld($array_ref, $index,
$tag_hash->{$tag_id}{'LOCATION'})) {
next;
}
}
$tag = &makeTag('-type' => "part",
'-doc_number' => $curr_doc,
'-sequence' => $section);
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
# keep the part sections for item
$section_part = $section;
$logger::DEBUG->increment('-name' =>
"Parts with NO description");
next;
}
##########################
## look for a ITEM line ##
##########################
if ($line =~ /^\s*ITEM\s+(\d+)[\.:]?\s+(.*)$/i) {
$section = $1;
$description = $2;
$description = &cleanUpDescription($description);
$tag_id = "item_" . $section;
$tag_id .= "_part_" . $section_part if ($section_part);
if ($tag_hash->{$tag_id}) {
if (&duplicateKeepOld($array_ref, $index,
$tag_hash->{$tag_id}{'LOCATION'})) {
next;
}
}
$tag = &makeTag('-type' => "item",
'-doc_number' => $curr_doc,
'-sequence' => $section,
'-description' => $description,
'-inside_part' => $section_part);
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
$logger::DEBUG->increment('-name' => "Item");
next;
}
##########################
## look for a NOTE line ##
##########################
if ($line =~ /^\s*NOTE\s+([A-Z]|\d+|[IVX]+)[\.:-]?\s+(.*)$/) {
$section = $1;
$description = $2;
$description = &cleanUpDescription($description);
$tag_id = "note_" . $section;
if ($tag_hash->{$tag_id}) {
if (&duplicateKeepOld($array_ref, $index,
$tag_hash->{$tag_id}{'LOCATION'})) {
next;
}
}
$tag = &makeTag('-type' => "note",
'-doc_number' => $curr_doc,
'-sequence' => $section,
'-description' => $description);
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
$logger::DEBUG->increment('-name' => "Note");
next;
}
###########################
## look for an HTML line ##
###########################
if ($line =~ //) {
if ($line =~ m@([^<]+)@) {
$src = $1;
$src = "
";
splice(@{$array_ref}, $index, 1, $src);
} elsif ($array_ref->[$index+1] =~ m@([^<]+)@) {
$src = $1;
$src = "";
splice(@{$array_ref}, $index, 2, $src);
} else {
print STDERR "ERROR: did no get html src from [$line]\n"
if ($debug > 1);
$logger::DEBUG->saveErrorsCurrent('-error' =>
"ERROR: did no get html src from [$line]");
}
$logger::DEBUG->increment('-name' => "HTML");
}
##########################
## look for an PDF line ##
##########################
if ($line =~ //) {
if ($line =~ m@([^<]+)@) {
$src = $1;
$src = "";
splice(@{$array_ref}, $index, 1, $src);
} elsif ($array_ref->[$index+1] =~ m@([^<]+)@) {
$src = $1;
$src = "";
splice(@{$array_ref}, $index, 2, $src);
} else {
print STDERR "ERROR: did no get pdf src from [$line]\n"
if ($debug > 1);
$logger::DEBUG->saveErrorsCurrent('-error' =>
"ERROR: did no get pdf src from [$line]");
}
$logger::DEBUG->increment('-name' => "PDF");
}
################################
## look for a GIF or JPG line ##
################################
if ($line =~ /<(GIF|JPG)>/) {
my $tag = $1;
if ($line =~ m@<$tag>([^<]+)$tag>@) {
$src = $1;
$src = "
";
splice(@{$array_ref}, $index, 1, $src);
} elsif ($array_ref->[$index+1] =~ m@([^<]+)$tag>@) {
$src = $1;
$src = "
";
splice(@{$array_ref}, $index, 2, $src);
} else {
print STDERR "ERROR: did no get img src from [$line]\n"
if ($debug > 1);
$logger::DEBUG->saveErrorsCurrent('-error' =>
"ERROR: did no get img src from [$line]");
}
$logger::DEBUG->increment('-name' => $tag);
}
}
if (defined %{$tag_hash} ) {
return %{$tag_hash};
} else {
return ();
}
}
#
# makes tags
# below are the main type of tags
# note that table has an "end tag" and a document(?)
# and that item tag is "inside" a part
#
#
#
#
#
#
sub makeTag {
my %params = @_;
my $type = $params{'-type'};
my $doc_number = $params{'-doc_number'};
my $sequence = $params{'-sequence'};
my $description = $params{'-description'};
my $end_tag = $params{'-end_tag'};
my $inside_part = $params{'-inside_part'};
my $id = "id=\"document_" . $doc_number;
if ($inside_part) {
$id .= "_part_" . $inside_part;
}
$id .= "_" . $type . "_" . $sequence . "\" ";
$sequence = &EDGAR::util::encode_av($sequence);
$description = &EDGAR::util::encode_av($description);
my $tag = "";
} else {
$tag .= ">";
}
return $tag;
}
#
# only tag the start of a table. No tags are placed inside of a table,
# except for the nested table.
# skip to the end of the table
#
sub tagTable {
my $index = shift;
my $array_ref = shift;
my $curr_doc = shift;
my $curr_tbl = shift;
my $tag_hash = shift;
my $tag = &makeTag('-type' => 'table', '-doc_number' => $curr_doc,
'-sequence' => $curr_tbl, '-end_tag' => 1);
my $tag_id = "table_" . $curr_tbl;
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
# leave at tag
my $did_col = 0;
while ($array_ref->[++$index] !~ m@@) {
next if ($array_ref->[$index] =~ /^\s+$/);
# column indicator line
if ($array_ref->[$index] =~ m@^\s*[ ]+$@) {
if ($did_col) {
# nested pseudo-table not allowed but we'll
# fake it for now
$curr_tbl += 0.1;
$tag = &makeTag('-type' => "table",
'-doc_number' => $curr_doc,
'-sequence' => $curr_tbl,
'-end_tag' => 1);
$tag_id = "table_" . $curr_tbl;
$tag_hash->{$tag_id}{'LOCATION'} = $index;
$tag_hash->{$tag_id}{'TAG'} = $tag;
$logger::DEBUG->increment('-name' =>
"Nest Document Table");
}
$did_col = 1;
}
}
$curr_tbl = int(++$curr_tbl); # increment and int the counter
return ($index, $curr_tbl, $tag_hash);
}
#
# clean up the descriptions (Item descr., Note descr., Part descr.)
#
sub cleanUpDescription {
my $description = shift;
$description =~ s/\d+\s*$//; # ends with a page number
$description =~ s/\.\.+//; # has dots in it
$description =~ s/^-//; # starts with a hypen
$description =~ s/^\s+|\s+$//g; # leading or trailing spaces
return $description;
}
#
# picks the (hopefully) correct tagged element
# returns 1 to keep the old tagged element (closer to the top of the doc)
# returns 0 to change to a new tagged element (closer to the end of the doc)
#
sub duplicateKeepOld {
my $array_ref = shift;
my $new_location = shift;
my $old_location = shift;
# the one with the most blank lines wins
my $old_blank_lines = 0;
my $new_blank_lines = 0;
# test the lines above and below for empty lines
$old_blank_lines++ if ($array_ref->[$old_location-1] =~ /^\s+$/);
$old_blank_lines++ if ($array_ref->[$old_location+1] =~ /^\s+$/);
$new_blank_lines++ if ($array_ref->[$new_location-1] =~ /^\s+$/);
$new_blank_lines++ if ($array_ref->[$new_location+1] =~ /^\s+$/);
if ($old_blank_lines > $new_blank_lines) {
return 1;
} elsif ($new_blank_lines > $old_blank_lines) {
return 0;
}
# test to see if the new one is a CONTINUE
return 1 if ($array_ref->[$new_location] =~ /CONTINUE/i);
return 1 if ($array_ref->[$new_location] =~ /Contd/);
# test the second line as well
return 1 if ($array_ref->[$new_location+1] =~ /CONTINUE/i);
return 1 if ($array_ref->[$new_location+1] =~ /Contd/);
# most leading spaces win
my $new_leading_spaces = 0;
if ($array_ref->[$new_location] =~ /^(\s+)\S/) {
$new_leading_spaces = length($1);
}
my $old_leading_spaces = 0;
if ($array_ref->[$old_location] =~ /^(\s+)\S/) {
$old_leading_spaces = length($1);
}
if ($old_leading_spaces > $new_leading_spaces) {
return 1;
} elsif ($new_leading_spaces > $old_leading_spaces) {
return 0;
}
# the shortest one wins
my $no_spaces = $array_ref->[$old_location];
$no_spaces =~ s/^\s+|\s+$//;
my $old_length = length($no_spaces);
$no_spaces = $array_ref->[$new_location];
$no_spaces =~ s/^\s+|\s+$//;
my $new_length = length($no_spaces);
if ($old_length < $new_length) {
return 1;
} elsif ($new_length < $old_length) {
return 0;
}
# the one with the most capitial letters win
my $old_num_cap = ($array_ref->[$old_location] =~ tr/[A-Z]/[A-Z]/);
my $new_num_cap = ($array_ref->[$new_location] =~ tr/[A-Z]/[A-Z]/);
if ($old_num_cap > $new_num_cap) {
return 1;
} elsif ($new_num_cap > $old_num_cap) {
return 0;
}
if ($debug > 5) {
print STDERR "duplicateKeepOld: Its a Tie! use the old one\n";
print STDERR "duplicateKeepOld old --> [$array_ref->[$old_location]]\n";
print STDERR "duplicateKeepOld new --> [$array_ref->[$new_location]]\n";
}
$logger::DEBUG->saveErrorsCurrent('-error' =>
"duplicateKeepOld: Its a Tie! use the old one\n".
"duplicateKeepOld old --> [$array_ref->[$old_location]]\n".
"duplicateKeepOld new --> [$array_ref->[$new_location]]\n");
return 1;
}
#
# splice tags from the tag_hash into the document_array
# add end tags to tags that needed the (do not end with />)
#
sub insertTags {
my $array_ref = shift;
my $hash_ref = shift;
my ($tag_id, $location, $tag, $index, $end_tag);
# go from the bottom to the top
foreach $tag_id (sort { $hash_ref->{$b}{'LOCATION'} <=>
$hash_ref->{$a}{'LOCATION'} }
keys %{$hash_ref}) {
$location = $hash_ref->{$tag_id}{'LOCATION'};
$tag = $hash_ref->{$tag_id}{'TAG'};
if ($hash_ref->{$tag_id}{'TAG'} =~ m@/>$@) {
splice(@{$array_ref}, $location, 0, $tag);
} else {
# make end tag
$tag_id =~ /^([^_]+)_/;
$end_tag = $1;
$end_tag = "";
splice(@{$array_ref}, $location+1, 0, $end_tag);
splice(@{$array_ref}, $location, 0, $tag);
}
}
}
#
# prints out what is stored in nested hash str
#
sub dumpHashDebug {
my $hash_ref = shift;
my $indent = shift;
my $key;
$indent = " " if !$indent;
foreach $key (keys %{$hash_ref}) {
print STDERR $indent, "$key <=> $hash_ref->{$key}\n";
if ($hash_ref->{$key} =~ /HASH/) {
&dumpHashDebug($hash_ref->{$key}, $indent . $indent);
}
}
print STDERR "\n";
}
#
# some Table of Contents have line like
# Item 1. This is Item One ....... 1
# Item 2. This is Item Two ....... 28
# the above lines at NOT the ITEM's we want to tag!
# the ITEM's we want to tag are
# Page 1.
# Item 1. This is Item One
#
# then some dribble about what Item one is and how great and import
# this Item is...
#
# We try and skip the ToC and tag the real ITEM's
# But while we are in the ToC grap some info, 'cause we can
#
sub readAndSkipToC {
my $index = shift;
my $array_ref = shift;
my $org_index = $index;
my (%toc_hash, $cnt, $line, $did_something, $part, $description);
my ($sequence, $last_action, $did_item, $check_item_cont);
while (++$index < $#{$array_ref}) {
$line = $array_ref->[$index];
if (!$line || ($line =~ /^\s+$/)) {
$did_item = 0;
next;
}
$did_something = 1;
# PART with description
if ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)[\.:]?\s+(.*)$@i) {
$part = $1;
$description = $2;
last if ($toc_hash{'PART'}{$part});
$description = &cleanUpDescription($description);
$toc_hash{'PART'}{$part} = $description;
# PART with no description
} elsif ($line =~ m@^\s*PART\s+([A-Z]|\d+|[IVX]+|F/S)\s*$@i) {
$part = $1;
last if ($toc_hash{'PART'}{$part});
$toc_hash{'PART'}{$part} = "NO_DESCRIPTION";
# ITEM
} elsif ($line =~ /^\s*ITEM\s+(\d+[A-Z]?)[\.:]?\s?(.*)$/i) {
$sequence = $1;
$description = $2;
last if ($toc_hash{'ITEM'}{$part}{$sequence});
$description = &cleanUpDescription($description);
$part = 1 if (!$part);
$toc_hash{'ITEM'}{$part}{$sequence} = $description;
$did_item = 1;
# see if this is a continued item
} elsif ($check_item_cont && ($line =~ /^\s+(.*)$/)) {
$description = $1;
$description = &cleanUpDescription($description);
$toc_hash{'ITEM'}{$part}{$sequence} .= " ".$description;
$did_item = 1; # mulit line cont?
# keep track if we did not do something
} else {
$did_something = 0;
$cnt++;
}
$check_item_cont = $did_item ? 1 : 0;
if ($did_something) {
# go back to here when we leave
$last_action = $index;
$cnt = 0;
} elsif ($cnt > 10) {
# quit
last;
}
}
$last_action++;
if ($debug > 5) {
print STDERR "** readAndSkipToC: Dump of the ToC hash\n";
&dumpHashDebug(\%toc_hash);
print STDERR "** readAndSkipToC: End of the ToC dump\n\n";
}
if (!%toc_hash) {
return ($org_index);
}
# need a way to pick the better ToC hash or merge the two hashes
# together. Example would be a break inside of a ToC
# NOTE at this time nothing is done with the %toc_hash
$logger::DEBUG->increment('-name' => "Table Of Contents");
return ($last_action);
}
sub version {
my $ver = $VERSION . "r" . $RELEASE;
if (__PACKAGE__ !~ /^main$/) {
$ver = __PACKAGE__ . " " . $ver;
}
return $ver;
}