# # file: edgar-table.pl # desc: parsed EDGAR table support # package EDGAR::Table; use strict; BEGIN { my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $edgar_root = "$iw_root/skulker/edgar/current"; # where we find our local libraries my($libdir) = "$edgar_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # general edgar library support require 'edgar-lib.pl'; # generl edgar support require 'edgar-util.pl'; } # debug mode? my $debug = 0; ################### ## PARSED TABLES ## ################### # # parse EDGAR TABLE element # sub parse { my $data = shift; $logger::DEBUG->increment('-name' => "edgar-table"); # table data -- local copy my $table = $data; # parsed results my %res = (); # read the col tag line %{$res{'column'}} = &getStandardColsParse(\$table); return () if (!$res{'column'}{'offset'}); # pre-process table data &preParse(\$table, \%res); $logger::DEBUG->current('-name' => "parseTableData"); &parseTableData(\$table, \%res); $logger::DEBUG->current('-name' => "parseCaption"); &parseCaption(\%res); $logger::DEBUG->current('-name' => "parseHeader"); &parseHeader(\%res); %res; } # # extract table section based on provided start/end text # sub extract_section { my $data = shift; my $start_text = shift; my $end_text = shift; # extracted section data my $sect = ""; $$data =~ /(${start_text}.*?${end_text})/gs && ($sect = $1); if ($sect) { # remove extracted section from data $$data =~ s/\Q$sect\E\n?//gs; # remove start/end tags from result $sect =~ s/${start_text}|$end_text//gs; # eat leading/trailing blank lines $sect =~ s/^[\s\n]+|[\s\n]+$//gs; } # return extracted section text $sect; } # # in edgar document tables standard columns are # definded by the start of the column tags # generaly etc # the types are S = label everything else is data # sub getStandardColsParse { my $data_ref = shift; my $got_stdCols = 0; # make string into array my @array = split(/\n/, $$data_ref); my($i, @offset, @type, @width); for ($i=0; $i<=$#array; $i++) { next if (!$array[$i] || ($array[$i] =~ /^\s+$/)); if ($array[$i] =~ m@^\s*<[SC]/?>[\s]+$@) { $array[$i] =~ s@/@@g; while ($array[$i] =~ /<(?=(.))/g) { push @offset, pos($array[$i])-1; if ($1 eq "S") { push @type, "label"; } else { push @type, "data"; } } $got_stdCols = 1; last; } } if (!$got_stdCols) { print STDERR "getStandardColsParse: could not find std column\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "could not find a standard column"); return (); } for $i (0 .. $#offset-1) { $width[$i] = $offset[$i+1] - $offset[$i] - 1; } $width[$#offset] = ""; return ('offset' => \@offset, 'width' => \@width, 'type' => \@type); } # # adds data to # $res->{'caption'} # $res->{'header'} # $res->{'footnote'} # $res->{'columns'} # sub preParse { my $data = shift; my $res = shift; my $footnote; # remove TABLE tags $$data =~ s@<\/?TABLE>@@gs; $res->{'caption'} = &extract_section($data, "", ""); $res->{'header'} = &extract_section($data, "", ""); # watch out for empty tags! # and NESTED tables! $$data =~ s@@@g; $$data =~ s@@@g; # other footnote formats possible foreach my $i (1..99) { $footnote = extract_section($data, "", ""); if ($footnote) { $res->{'footnote'} .= $footnote; $logger::DEBUG->increment('-name' => "footnote"); } } # extract table footer info, delimited by FN start and end text $footnote = extract_section($data, "", ""); while ($footnote) { $res->{'footnote'} .= $footnote; $logger::DEBUG->increment('-name' => "footnote"); $footnote = extract_section($data, "", ""); } # do we have pseudo nested tables? eat tags for now $$data =~ s@\n\n@\n@gs; } # # add info to # $res->{'table'}{'data'} # $res->{'table'}{'decription'} # sub parseTableData { my $data_ref = shift; my $res = shift; # make string into array my(@array) = split(/\n/, $$data_ref); my($i); for ($i=0; $i<=$#array; $i++) { next if ((!$array[$i]) || ($array[$i] =~ /^\s+$/) || ($array[$i] =~ m@^[\s]+$@)); my ($array_ref, $description) = &splitRows($i, \@array, $res->{'column'}{'offset'}, "body"); push (@{$res->{'table'}{'data'}}, $array_ref); push (@{$res->{'table'}{'description'}}, $description); } 0; } # # add info to # $res->{'table'}{'caption-data'} # $res->{'table'}{'caption-description'} # sub parseCaption { my $res = shift; return 0 if (!$res->{'caption'}); $logger::DEBUG->increment('-name' => "Caption"); $logger::DEBUG->currentLines('-value' => $res->{'caption'}); # make string into array my @array = split(/\n/, $res->{'caption'}); my $i; for ($i=0; $i<=$#array; $i++) { next if ((!$array[$i]) || ($array[$i] =~ /^\S+$/) || ($array[$i] =~ /^[-_= ]+$/)); my ($array_ref, $description) = &splitRows($i, \@array, $res->{'column'}{'offset'}, "header"); push (@{$res->{'table'}{'caption-data'}}, $array_ref); push (@{$res->{'table'}{'caption-description'}}, $description); } 0; } # # add info to # $res->{'table'}{'header-data'} # $res->{'table'}{'header-description'} # sub parseHeader { my $res = shift; return 0 if (!$res->{'header'}); $logger::DEBUG->increment('-name' => "Header"); $logger::DEBUG->currentLines('-value' => $res->{'header'}); $res->{'table'}{'header-data'} = $res->{'header'}; $res->{'table'}{'header-description'} = "0-" . $#{$res->{'column'}{'offsets'}}; 0; } sub missingColumnUseSpan { my $index = shift; my $array_ref = shift; my $partial_line_aref = shift; my $thisLineCols_aref = shift; my $stdCols_aref = shift; my($i, $j, $current_start); my(@colStarts, @colEnds); my(@full_line, @description); # look at next line for a clue (generaly headers do this) if (!$array_ref->[$index+1] || ($array_ref->[$index+1] =~ /^\s+$/) || ($array_ref->[$index+1] !~ /^[-_= ]+$/)) { return (0, 0); # might let this go through, later? } else { push @colStarts, 0 if ($array_ref->[$index+1] !~ /^\s\s/); while ($array_ref->[$index+1] =~ /\s( +)(?=[^ ])/g) { push @colStarts, pos($array_ref->[$index+1]); } return (0, 0) if ($#{$partial_line_aref} < $#colStarts); while ($array_ref->[$index+1] =~ /(\S+)(?=\s)/g) { push @colEnds, pos($array_ref->[$index+1]); } push @colEnds, length($array_ref->[$index+1]); } $current_start = 0; # compair the start and end position of the columns for ($i=0; $i<=$#colStarts; $i++) { for ($j=0; $j<=$#{$stdCols_aref}; $j++) { if (&EDGAR::util::close2equal($colStarts[$i], $stdCols_aref->[$j], 5)) { $full_line[$j] = $partial_line_aref->[$i]; $description[$j] = $j; $current_start = $j; } else { $full_line[$j] = "" if (!$full_line[$j]); $description[$j] = "" if (!$description[$j]); } if (($description[$current_start] !~ /-/) && ($colEnds[$i] < $stdCols_aref->[$j])) { $description[$current_start] .= "-" . ($j-1); } } } my($have_description) = 0; for ($i=0; $i<=$#description; $i++) { if (!$description[$i]) { if ($have_description) { $description[$i] = 0; } else { $description[$i] = $i ."-". $i; } } elsif ($description[$i] !~ /-/) { $have_description = 1; $description[$i] .= "-" . $#description; } } return (\@full_line, \@description); } sub missingColumnSingleSpan { my $line = shift; my $number2span = shift; my(@full_line, @description, $i); $line =~ s/^\s+//; $full_line[0] = $line; $description[0] = "0-".$number2span; for $i (1..$number2span) { $full_line[$i] = ""; $description[$i] = 0; } return (\@full_line, \@description); } sub missingColumnUseEmpty { my $index = shift; my $array_ref = shift; my $partial_line_aref = shift; my $thisLineCols_aref = shift; my $stdCols_aref = shift; my($i, $j, $distance, $min, @full_line, $offset); for ($i = 0; $i <= $#{$stdCols_aref}; $i++) { $full_line[$i] = ""; # pre-fill the line } $min = 9999; for ($i=0; $i<=$#{$thisLineCols_aref}; $i++) { for ($j=0; $j<=$#{$stdCols_aref}; $j++) { $distance = abs($thisLineCols_aref->[$i] - $stdCols_aref->[$j]); if ($distance < $min) { $min = $distance; $offset = $j; } } $min = 9999; if (!$full_line[$offset]) { $full_line[$offset] = $partial_line_aref->[$i]; } else { $full_line[$offset] .= " ".$partial_line_aref->[$i]; } } return @full_line; } sub fixMissingColumns { my $index = shift; my $array_ref = shift; my $thisLineCols_aref = shift; my $stdCols_aref = shift; my $split_what = shift; my($full_line_aref, $desc_aref); if (!$#{$thisLineCols_aref} && ($split_what eq "header")) { ($full_line_aref, $desc_aref) = &missingColumnSingleSpan($array_ref->[$index], $#{$stdCols_aref}); return ($full_line_aref, $desc_aref); } $array_ref->[$index] =~ s/^\s+//; my @partial_line; push @partial_line, 0 if ($array_ref->[$index] =~ /^[^ ]/); @partial_line = split(/ +/, $array_ref->[$index]); if ($split_what eq "header") { ($full_line_aref, $desc_aref) = &missingColumnUseSpan($index, $array_ref,\@partial_line, $thisLineCols_aref, $stdCols_aref); return ($full_line_aref, $desc_aref) if ($full_line_aref); } my (@full_line) = &missingColumnUseEmpty($index, $array_ref, \@partial_line, $thisLineCols_aref, $stdCols_aref); my (@tmp_array) = &descriptionFull($stdCols_aref); return (\@full_line, \@tmp_array); } sub fixExtraColumns { my $index = shift; my $array_ref = shift; my $thisLineCols_aref = shift; my $stdCols_aref = shift; my $split_what = shift; my ($line, $i, @colStarts, @new_array, $element); # get rid of tabs $array_ref->[$index] =~ s/\t/ /g; my (@array) = split(/ {2,}/, $array_ref->[$index]); # fix the easy one, a blank first element shift @array if ($array[0] eq ""); return \@array if ($#array == $#{$stdCols_aref}); # add a header only fix HERE! (At Some Future Time) ASFT # if stand alone "$" (or single char) move it to next element for ($i=0; $i<$#array; $i++) { if ($array[$i] =~ /^(.)$/) { $array[$i+1] = $1 ." ". $array[$i+1]; $array[$i] = ""; } } # remove blank elements in the array; foreach $element (@array) { push (@new_array, $element) if ($element); } return \@new_array if ($#new_array == $#{$stdCols_aref}); my $new_line = join(" ", @new_array); # 7 spaces # fix ones where there are extra spaces in a column for ($i=3; $i<6; $i++) { @array = split(/ {$i,}/, $new_line); if ($#array == $#{$stdCols_aref}) { return \@array; } elsif ($#array < $#{$stdCols_aref}) { # gone to far, go back one and quit @array = split(/ {$i-1,}/, $array_ref->[$index]); last; } } if ($#array == $#{$stdCols_aref}) { return \@array; # can't get here, but? } elsif ($#new_array < $#{$stdCols_aref}) { $array_ref->[$index] = join(" ", @new_array); push @colStarts, 0 if ($array_ref->[$index] !~ /^\s\s/); while ($array_ref->[$index] =~ /\s( +)(?=[^ ])/g) { push @colStarts, pos($array_ref->[$index]); } my ($array_ref, $desc_aref) = &fixMissingColumns($index, $array_ref, \@colStarts, $stdCols_aref, $split_what); return ($array_ref, $desc_aref); } else { print STDERR "fixExtraColumns: unable to fix extra columns\n" if ($debug > 1); $logger::DEBUG->saveErrorsCurrent('-error' => "unable to fix extra columns"); return \@array; } } sub dotsToSpaces { my $change_string = shift; $change_string =~ tr/./ /; return $change_string; } sub splitRows { my $index = shift; my $array_ref = shift; my $stdCols_aref = shift; my $split_what = shift; my(@colStarts, $i, @array, $fixed_aref, $desc_aref, @description); # get rid of dots (...) $array_ref->[$index] =~ s/(\.\.+)/&dotsToSpaces($1)/eg; push @colStarts, 0 if ($array_ref->[$index] !~ /^\s\s/); while ($array_ref->[$index] =~ /\s( +)(?=[^ ])/g) { push @colStarts, pos($array_ref->[$index]); } $logger::DEBUG->currentIncrement('-name' => "total rows"); if ($#colStarts == $#{$stdCols_aref}) { # same number of columns, good, we are done $logger::DEBUG->currentIncrement('-name' => "good rows"); $array_ref->[$index] =~ s/^\s+//; @array = split(/ +/, $array_ref->[$index]); @description = &descriptionFull($stdCols_aref); return (\@array, \@description); } elsif ($#colStarts < $#{$stdCols_aref}) { # fewer, find the missing column(s) $logger::DEBUG->currentIncrement('-name' => "missing rows"); ($fixed_aref, $desc_aref) = &fixMissingColumns($index, $array_ref, \@colStarts, $stdCols_aref, $split_what); return ($fixed_aref, $desc_aref); } elsif ($#colStarts > $#{$stdCols_aref}) { # extra, deal with extra column(s) $logger::DEBUG->currentIncrement('-name' => "extra rows"); ($fixed_aref, $desc_aref) = &fixExtraColumns($index, $array_ref, \@colStarts, $stdCols_aref, $split_what); if (!$desc_aref) { @description = &descriptionFull($stdCols_aref); $desc_aref = \@description; } return ($fixed_aref, $desc_aref); } } # # table column description # an array of start and stop values seperated by "-" # Ex. 0-0 means this column starts on column 0 ends on column 0, normal # 2-3 starts on column 2 ends on column 3, a column span of two columns sub descriptionFull { my $col_ref = shift; my($i, @array); for ($i=0; $i<=$#{$col_ref}; $i++) { push (@array, "$i-$i"); } return @array; } # Added by gnarayan # Try to obtain table type from the data sub getTableType { my($section) = shift; my($table) = shift; my($tableType) = ""; my($i, $j, $arr); # Set of phrases that indicate an income statement my(@incomeStat) = ('Net income','net profits','net earnings','revenue', 'sales','net sales','per share','cost of goods sold', 'cost of products sold','operating income','gross profit', 'pretax income','pre-tax income','income before taxes', 'income before income taxes','interest expense', 'interest income','general and administrative expense', 'income taxes','revenues'); # Set of phrases that indicate a cash flow statement my(@cashFlow) = ('Cash','interest paid','income taxes paid','assets acquired', 'proceeds from sales of assets', 'deferred income taxes', 'depreciation and amortization', 'change in accounts payable', 'change in accounts receivable', 'capital expenditures', 'cash payments'); # Set of phrases that indicate a balance sheet my(@balSheet) = ('Assets', 'current assets', 'other assets', 'liabilities', 'short-term debt', 'common stock', 'preferred', 'equity', 'shareholder equity', 'prepaid expenses', 'property', 'plant and equipment'); my($data) = ""; # First go thorugh caption and see if there is a match for $i (0 .. $#{$table->{'table'}{$section}}) { for $j (0 .. $#{$table->{'table'}{$section}[$i]}) { $data = $table->{'table'}{$section}[$i][$j]; if($tableType =~ /\w+/) { last; } # Compare with income statement foreach $arr(@incomeStat) { if($data =~ /$arr/si) { $tableType = "Income Statement"; last; } } # Compare with cash flow statement if($tableType !~ /\w+/) { foreach $arr(@cashFlow) { if($data =~ /$arr/si) { $tableType = "Cash Flow Statement"; last; } } } # Compare with balance sheet if($tableType !~ /\w+/) { foreach $arr(@balSheet) { if($data =~ /$arr/si) { $tableType = "Balance Sheet"; last; } } } } if($tableType =~ /\w+/) { last; } } return $tableType; } sub toXML { my($parsed) = shift; my($doc) = shift; my($seq) = shift; my($desc) = shift; my($i, $j, $start, $end, $table, $type); # Added by gnarayan my($tableType) = ""; # Try to obtain table type $tableType = &getTableType('caption-data', $parsed); if($tableType !~ /\w+/) { $tableType = &getTableType('data', $parsed); } $desc = $tableType; $logger::DEBUG->increment('-name' => "toXML_table"); $logger::DEBUG->saveCurrentInfo('-type' => "table_type", '-info' => $tableType); # table attributes - id, document #, and sequence # REQUIRED return "" unless ($doc && $seq); $table = $parsed->{'table'}; return "" unless ($table); # unique table identifier my($id) = "document_" . $doc . "_table_" . $seq; # start of parsed table my($xml) = "{'caption-data'}) { $xml .= &toXMLcaptionData($parsed); } # table header -- non-caption/footer/row data, if any (OPTIONAL) if ($table->{'header-data'}) { $xml .= "\n"; $xml .= EDGAR::XML::encode($table->{'header-data'}) . "\n"; $xml .= "\n"; } # table body (REQUIRED) $xml .= "" . "\n" if (!$table->{'caption-data'}); # table row data for $i (0 .. $#{$table->{'data'}}) { $xml .= " " . "\n"; for $j (0 .. $#{$table->{'data'}[$i]}) { $xml .= " {'data'}[$i][$j] =~ /^[-_ ]+$/) { $type = "\"srule\""; } elsif ($table->{'data'}[$i][$j] =~ /^[= ]+$/) { $type = "\"drule\""; } else { $type = &EDGAR::util::encode_av( $parsed->{'column'}{'type'}[$j] ); } $xml .= " type=$type"; $xml .= " offset=". &EDGAR::util::encode_av( $parsed->{'column'}{'offset'}[$j] ); $xml .= " width=" . &EDGAR::util::encode_av( $parsed->{'column'}{'width'}[$j] ) if $parsed->{'column'}{'width'}[$j]; if ($table->{'description'}[$i][$j] =~ /-/) { ($start, $end) = split(/-/, $table->{'description'}[$i][$j]); } else { $start = $end = 0; } if ($start != $end) { # note there are no colspans in body at this time my $answer = $end - $start + 1; $xml .= " colspan=". &EDGAR::util::encode_av($answer); } $xml .= ">"; if ($table->{'data'}[$i][$j]) { if ($type !~ /[sd]rule/) { $xml .= &EDGAR::XML::encode( $table->{'data'}[$i][$j] ); } else { $xml .= " "; } $xml .= "\n"; } else { $xml .= " \n"; } } $xml .= " " . "\n"; } # end of table body $xml .= "" . "\n"; # table footer -- footnotes (OPTIONAL) if ($parsed->{'footnote'}) { $xml .= "" . "\n"; $parsed->{'footnote'} =~ s@@@gs; $xml .= EDGAR::XML::encode($parsed->{'footnote'}) . "\n"; $xml .= "" . "\n"; $logger::DEBUG->increment('-name' => "footnote"); } # end parsed table $xml .= "" . "\n"; return $xml; } # # this is now header info # there is no more caption data (as of this time) # sub toXMLcaptionData { my $parsed = shift; my ($i, $j, $xml, $start, $end, $colspan, $type); $xml = "" . "\n"; for $i (0 .. $#{$parsed->{'table'}{'caption-data'}}) { $xml .= " " . "\n"; for $j (0 .. $#{$parsed->{'table'}{'caption-data'}[$i]}) { if ($colspan) { $colspan--; next; } if ($parsed->{'table'}{'caption-description'}[$i][$j] =~ /-/) { ($start, $end) = split(/-/, $parsed->{'table'}{'caption-description'}[$i][$j]); $colspan = $end - $start; } else { $colspan = 0; } if ($parsed->{'table'}{'caption-data'}[$i][$j] =~ /^[-_ ]+$/) { $type = "\"srule\""; } elsif ($parsed->{'table'}{'caption-data'}[$i][$j] =~ /^[= ]+$/) { $type = "\"drule\""; } else { $type = "\"header\""; } $xml .= " {'column'}{'offset'}[$j]); $xml .= " width=" . &EDGAR::util::encode_av($parsed->{'column'}{'width'}[$j]) if $parsed->{'column'}{'width'}[$j]; if ($colspan) { $xml .= " colspan=" . &EDGAR::util::encode_av($colspan+1); } $xml .= ">"; if ($parsed->{'table'}{'caption-data'}[$i][$j]) { if ($type !~ /[sd]rule/) { $xml .= &EDGAR::XML::encode($parsed->{'table'}{'caption-data'}[$i][$j]); } else { $xml .= " "; } $xml .= "\n"; } else { $xml .= " \n"; } } $xml .= " " . "\n"; } $xml .= "\n"; return $xml; } # # dump parsed table data # sub dumpHtmlTables { my($res) = shift; my($i, $j, $start, $end); print "\n"; # output the legend if ($res->{'table'}{'header-data'}) { ($start, $end) = split(/-/, $res->{'table'}{'header-description'}); print "\n"; print "\n"; } # output the caption print "\n"; for ($i = 0; $i <= $#{$res->{'table'}{'caption-data'}}; $i++) { for ($j = 0; $j <= $#{$res->{'table'}{'caption-data'}[$i]}; $j++) { next if (!$res->{'table'}{'caption-description'}[$i][$j]); ($start, $end) = split(/-/, $res->{'table'}{'caption-description'}[$i][$j]); if ($start == $end) { print " \n"; } print "\n"; } # output the data for ($i = 0; $i <= $#{$res->{'table'}{'data'}}; $i++) { for ($j = 0; $j <= $#{$res->{'table'}{'data'}[$i]}; $j++) { next if (!$res->{'table'}{'description'}[$i][$j]); ($start, $end) = split(/-/, $res->{'table'}{'description'}[$i][$j]); if ($start == $end) { print " \n"; } print "\n"; } print "
"; print $res->{'table'}{'header-data'}, "
"; } else { print " "; } print $res->{'table'}{'caption-data'}[$i][$j], "
"; } else { print " "; } print $res->{'table'}{'data'}[$i][$j], "
\n"; } ################# ## TEXT TABLES ## ################# # # return an array of the column start positions for a text table row # return () is row is "bad" row # sub getTextTableColumns { my $line = shift; my @col_array; if (!$line || ($line =~ /^\s+$/) || ($line =~ /^[-_= ]+---|___|===[-_= ]+$/)) { return (); } push @col_array, 0 if ($line !~ /^\s\s/); while ($line =~ /\s( +)(?=[^ ])/g) { push @col_array, pos($line); } return @col_array; } # # if in a table # return the start position for the columns in this row # if not in table # return 0 # sub ifTextTable { my($line_ptr) = shift; my($text_aref) = shift; my($column2toEnd, $column, $done, $prevLine); my($headerLeadingSpaces, $columnLeadingSpaces, $cnt); my $curr_line = $text_aref->[$line_ptr]; my $next_line = $text_aref->[$line_ptr+1]; return () if !$next_line; # tables have minium of |some text| |some more text| return () if ($curr_line !~ /^.+?\S\s{3,}\S/); return () if ($next_line !~ /^.+?\S\s{3,}\S/); my @curr_line_cols = &getTextTableColumns($text_aref->[$line_ptr]); my @next_line_cols = &getTextTableColumns($text_aref->[$line_ptr+1]); return () if ($#curr_line_cols != $#next_line_cols); # if the cols are close to lining up, call it a table my $error_cnt = 0; for (my $i=0; $i<=$#curr_line_cols; $i++) { if (&EDGAR::util::close2equal($curr_line_cols[$i], $next_line_cols[$i], 2)) { $error_cnt--; } else { $error_cnt++; } } if ($error_cnt < 0) { return @curr_line_cols; } else { return (); } } sub inTableRow { my $std_cols_ref = shift; my $test_cols_ref = shift; my ($more_cols, $fewer_cols); if ($#{$std_cols_ref} > $#{$test_cols_ref}) { $more_cols = $std_cols_ref; $fewer_cols = $test_cols_ref; } else { $more_cols = $test_cols_ref; $fewer_cols = $std_cols_ref; } my ($exact, $no_match, $close, $i); $exact = $no_match = $close = 0; for ($i=0; $i<=$#{$more_cols}; $i++) { if (defined $fewer_cols->[$i] && ($more_cols->[$i] == $fewer_cols->[$i])) { $exact++; } elsif ($fewer_cols->[$i+1] && ($more_cols->[$i] == $fewer_cols->[$i+1])) { $exact++; $no_match++; } elsif (defined $fewer_cols->[$i] && &EDGAR::util::close2equal($more_cols->[$i], $fewer_cols->[$i], 3)) { $close++; } elsif ($fewer_cols->[$i+1] && &EDGAR::util::close2equal($more_cols->[$i], $fewer_cols->[$i+1], 3)) { $close++; $no_match++; } else { $no_match++; } } if ($no_match >= ($exact+$close)) { return 0; # not a table row } elsif ($close || $no_match) { return 1; # a table row, a table problem } else { return 1; # a table row, not a table problem } } # # goes upwards looking for the start of a table # # things I know # 1) I'm in a table # 2) I know the standard columns start point # sub findStartOfTable { my $index = shift; my $text_aref = shift; my $std_col_aref = shift; my ($line, @col_start, $table_row, $table_prob); while (--$index > 0) { $line = $text_aref->[$index]; # out if a black line return $index+1 if (!$line || ($line =~ /^\s+$/)); next if ($line =~ /^[-_= ]+---|___|===[-_= ]+$/); # out if not a table row return $index+1 if ($line !~ /^.+?\S\s{3,}\S/); # still in table check, very liberal @col_start = &getTextTableColumns($line); ($table_row, $table_prob) = &inTableRow($std_col_aref, \@col_start); if ($table_row) { next; } else { # out of the table return ($index+1); } } } sub textTable2HTML { my $text_ref = shift; my $std_col_aref = shift; my $html = "\n"; my ($i, $table_row_ref, $description, $element); for ($i=0; $i<=$#{$text_ref}; $i++) { # skip the line is it only [-_=] next if ($text_ref->[$i] =~ /^[\-_= ]+$/); # change tags into spaces (4 spaces) $text_ref->[$i] =~ s/\t/ /g; ($table_row_ref, $description) = &splitRows($i, $text_ref, $std_col_aref, "body"); $html .= "\n"; foreach $element (@{$table_row_ref}) { $element =~ s/^\s+|\s$//g; $html .= &addRowTags($element); } $html .= "\n"; } $html .= "
"; return $html; } sub addRowTags{ my($text) = shift; if ($text =~ /^[\%\$\d,\.\(\) ]+$/) { $text = " $text\n"; } else { $text = " $text\n"; } $text; } # keep require happy 1; =head1 NAME edgar-table.pl - EDGAR table routines =head1 PACKAGE EDGAR::Table =head1 SYNOPSIS require edgar-table.pl; =head1 REQUIRES Perl, version 5.001 or higher. =head1 DESCRIPTION EDGAR table routines. =over 3 =head1 METHODS =head2 parse =item * parses an EDGAR table element. =item * A hash array of parsed table data is returned. =item example: my (%data) = &EDGAR::Table::parse($tableElement); =head2 preParse =item * pre-parse table data to extract header and footer data. The standard column width and types are found =item * the table data is modified to extract header and footer data. =item example: &EDGAR::Table::preParse(\$tableElement, \%results); =head2 parseTableData =item * parse and extract table the data in the table block =item * row data is parsed and extracted. =item example: &EDGAR::Table::parseTableData(\$tableElement, \%results); =head2 extract_section =item * extracts table section based on provided start and end text =item * the data within the begin and end tags is returned. =item example: my($sect) = &EDGAR::Table::extract_section("this is", "", ""); =head2 toXML =item * generate XML structure from table data =item * this routine the table data in XML format =item example: my(@row) = &EDGAR::Table::toXML(\%parsed, $doc, $seq, $desc); =head2 dump =item * dump parsed table data - used for debugging =item example: &EDGAR::Table::dump(\%parsed); =back =head1 COPYRIGHT Copyright 1999 Invisible Worlds. =cut