#
# parsing functions for the various EDGAR document types
#
# NOTE(review): this copy of the file appears to have been passed through a
# tool that stripped anything resembling an SGML/HTML tag and collapsed line
# breaks.  Backslash-escaped tag literals (e.g. <\/CAPTION>) survived while
# other patterns and string literals are now empty (s@@@g, s###gs,
# split(/.*/, ...), "" . trim(...) . "\n").  Those statements are reproduced
# exactly as found and flagged individually; they need to be restored from
# the original source rather than guessed.
#

# we want perl 5.00x or later
require 5.004;

# limit scope
package EDGARdoc::util;

require "edgar-date.pl";

#
# Subroutine to find the number of columns in a row.
#
# Takes one string; returns a counter that starts at 1 and is incremented
# once per pass of the loop below.
#
# NOTE(review): the pattern contains an empty capture group, presumably where
# a tag literal used to be.  As written, $rest is the whole first line of
# $cols, so the loop does not appear to make progress when that line is
# non-empty -- TODO restore the original pattern before using this sub.
#
sub findNoCols {
    my($cols) = @_;
    my($numcol, $rest, $match);

    $numcol = 1;
    while($cols) {
        ($match, $rest) = ($cols =~ /.*?()(.*)/);
        $numcol++;
        $cols = $rest;
    }
    return $numcol;
}

#
# Subroutine to fix data for munging.
#
# Takes the text of a form and returns a copy with decoration and label
# noise removed.
#
sub fixText {
    my($text) = shift;

    # NOTE(review): the next four patterns are empty, presumably stripped tag
    # literals.  Perl treats an empty pattern as "reuse the last successfully
    # matched pattern", so what these remove depends on earlier matches.
    $text =~ s@@@g;
    $text =~ s@@@g;
    $text =~ s@@@g;
    $text =~ s@@@g;

    # remove tags
    # NOTE(review): pattern is empty here as well; see the note above.
    $text =~ s###gs;

    # remove ===============
    $text =~ s/={3,}//gs;

    # get rid of "(Month/day/year)" items
    $text =~ s@\(month/day/year\)@@gis;
    $text =~ s@\(month/date/year\)@@gis;
    $text =~ s@\(mon/day/year\)@@gis;
    $text =~ s@\(month/year\)@@gis;

    # fix anomaly in table data (item number missing its period)
    $text =~ s/5 Relationship of Reporting/5. Relationship of Reporting/;

    # get rid of "(voluntary)" items
    $text =~ s@\(voluntary\)@@gis;

    # get rid of unnecessary labels
    $text =~ s/(.*)\(last\).*\(first\).*(\(middle\)|\(mi\))(.*)/$1 $3/i;
    $text =~ s/\(street\)//i;
    $text =~ s/(.*)\(city\).*\(state\).*\(zip\)(.*)/$1$2/i;

    $text;
}

#
# Subroutine to get footer information (signature and signature date)
# for forms 3,4,5
#
# Returns the list ($signature, $date).
#
sub getFormFooterInfo {
    my($text) = shift;
    my($signature, $date);

    if ($text =~ /SIGNATURE OF REPORTING PERSON/si) {
        ($signature, $date) = $text =~ /SIGNATURE OF REPORTING PERSON(.*?)DATE(.*)/sig;
        $signature =~ s/\/s\///i;
        # nothing usable between the two labels: fall back to the "/s/" line
        if ($signature !~ /\w/) {
            ($signature, $date) = $text =~ /.*\/s\/\s*?(.*?)\s{3,}(.*?)\n/i;
        }
    }
    else {
        ($signature, $date) = $text =~ /.*\/s\/\s*?(.*?)\s{3,}(.*?)\n/i;
    }
    return($signature, $date);
}

#
# Subroutine to get initial question information for forms 3,4,5
#
# Returns, in order: first name, middle name, last name, address, city,
# state, zip code, issuer name, issuer ticker, IRS id, event month/day/year,
# amendment month/day/year, director, owner, officer, other, relationship
# addendum, individual filing, joint filing.
#
# Reads the package variable $debug to decide whether to print diagnostics.
#
# NOTE(review): several "last element in the header" substitutions below have
# a newline where the collapsed source had a line break inside the pattern,
# and one ($dateInfo) has nothing there at all.  These look like stripped tag
# literals; they are kept as found.
#
sub getFormInitialQuestionsInfo {
    my($text) = shift;
    my($firstName, $middleName, $lastName, $address, $city, $state, $zipCode,
       $issuerName, $issuerTicker, $irsId, $eventMonth, $eventDay, $eventYear,
       $amendMonth, $amendDay, $amendYear, $director, $owner, $officer, $other,
       $relationAddendum, $indFiling, $jointFiling);
    my($dateInfo, $issuerRelInfo, $filingType, $location, $addressInfo, $issuerData);
    # scratch variables; previously undeclared and therefore package globals
    my($misc, $name);

    # pull data out of ascii tables; "|" delimit cells
    if ($text =~ m/^\|/) {
        $text = &getAsciiTableData($text);
        if ($debug > 10) {
            print "data brought back from getAsciiTableData ---------------\n";
            print "$text\n";
            print "--------------------\n";
        }
    }

    # remove unnecessary "=" and "-" and "_"
    $text =~ s/={3,}//gs;
    $text =~ s/_{3,}//gs;
    $text =~ s/-{3,}//gs;

    # get rid of blank lines
    $text =~ s/\n\s*\n/\n/gs;

    # date of event
    # fix verbiage where "requiring" is spelled "re- quiring"
    $text =~ s/re-\s+quiring/Requiring/gis;
    ($dateInfo) = $text =~ /Date of Event Requiring Statement(.*?)(\d\.)/sig;
    ($eventMonth, $eventDay, $eventYear) = &getDateParts($dateInfo);

    # amendment date
    ($dateInfo) = $text =~ /If Amendment, Date of Original(.*?)(\d\.)/sig;
    # in case it's the last element in the header
    $dateInfo =~ s/(.*).*/$1/si;
    # remove "not applicable" and "N/A" replies
    $dateInfo =~ s@not applicable@@sig;
    $dateInfo =~ s@n/a@@sig;
    ($amendMonth, $amendDay, $amendYear) = &getDateParts($dateInfo);

    # IRS ID or social security # of reporting person
    $text =~ s/Se-\s+curity/Security/gis;
    $text =~ s/I.R.S./IRS/gis;
    $text =~ s/Soc. Sec. No./Social Security Number/gis;
    ($irsId) = $text =~ /IRS.*Number of Reporting Person.*?(\d{3}-\d{2}-\d{4}).*(\d\.).*/sig;
    # in case it's the last element in the header
    $irsId =~ s/(.*)\n.*/$1/si;
    # remove colons and spaces
    $irsId =~ s/:|\s+//sig;

    # relationship of reporting person to issuer
    $text =~ s/Issurer/Issuer/gis;
    ($issuerRelInfo) = $text =~ /Relationship of Reporting Person.*?to\s*?Issuer(.*?)(\d\.).*/sig;
    # get rid of "(Check all applicable)"
    $issuerRelInfo =~ s/\(Check all applicable\)//sig;
    # in case it's the last element in the header
    $issuerRelInfo =~ s/(.*)\n.*/$1/si;
    $director = &isChecked("Director", $issuerRelInfo);
    $owner = &isChecked("10% Owner", $issuerRelInfo);
    $officer = &isChecked("Officer", $issuerRelInfo);
    $other = &isChecked("Other", $issuerRelInfo);

    # title of reporting person - if it exists
    ($misc, $relationAddendum) = $issuerRelInfo =~ /.*?(\(specify.*below\)|title:)(.*)/sig;

    # filing type
    ($filingType) = $text =~ /Individual or Joint.*?Group.*?ing(.*?)(\d\.).*/sig;
    $filingType =~ s/\s+/ /sig;
    # get rid of "(Check Applicable Line)"
    $filingType =~ s/\(Check applicable line\)//sig;
    # in case it's the last element in the header
    $filingType =~ s/(.*)\n.*/$1/si;
    $jointFiling = &isChecked("Form filed by More than One Reporting Person", $filingType);
    $indFiling = &isChecked("Form filed by One Reporting Person", $filingType);

    # issuer name and ticker symbol
    ($issuerData) = $text =~ /Issuer\s+Name\s+and\s+Ticker\s+or\s+Trading\s+Symbol(.*?)(\d\.).*/sig;
    # in case it's the last element in the header
    $issuerData =~ s/(.*)\n.*/$1/si;
    # remove white space
    $issuerData =~ s@\s+|/@ @g;
    # remove parenthesis, quotes, and colons
    $issuerData =~ s/\(|\)|"|:|;//g;
    # remove naming convention anomalies
    $issuerData =~ s/(nasdaq)|(nyse)|(OTC-Bulletin Board Symbol =)//ig;
    $issuerData = EDGAR::XML::trim($issuerData);
    # the last word is taken as the ticker, everything before it as the name
    ($issuerName, $issuerTicker) = $issuerData =~ /(.*)\s+(\w+)$/;

    # address info
    $text =~ s/Person\(s\)/Person/g;
    ($addressInfo) = $text =~ /Name and Address of Reporting Person(.*?)(\d\.).*/sig;
    # in case it's the last element in the header
    $addressInfo =~ s/(.*)\n.*/$1/si;
    # remove beginning and ending spaces
    $addressInfo =~ s/^(\*|:)//g;
    $addressInfo =~ s/^\s+//sg;
    $addressInfo =~ s/\s+$//sg;
    # first line is the name, last line the location, the rest the address
    ($name, $address, $location) = $addressInfo =~ /^(.+?)\n(.+)\n(.+)$/sg;
    $name =~ s/\s+$//g;

    # if there's a comma in the name, switch the first and last name
    if ($name =~ /\.$/) {
        $name =~ s/,//;
        ($lastName, $firstName, $middleName) = $name =~ /(\w+)\s+(\w+)\s+(\w*)/;
    }
    elsif ($name =~ /,/) {
        ($lastName, $firstName, $middleName) = $name =~ /(\w+)\,\s+(\w+)|((\w+)\s+(\w*))/;
    }
    else {
        $name =~ s/,|\.//;
        ($firstName, $middleName, $lastName) = $name =~ /(\w+)\s+(\w*?)\s*(\w+)$/;
        if ($debug > 5) {
            print "NAME INFO:::$name\n";
            print "FIRST:::$firstName\n";
            print "LAST:::$lastName\n";
            print "MI:::$middleName\n\n";
        }
    }

    if ($location =~ /,/) {
        $location =~ s/,/ /;
    }
    $location =~ s/^\s+//g;
    $location =~ s/\s+$//g;
    ($city, $state, $zipCode) = $location =~ /(.+?)\s{2,}(.+)\s+(\d{5}|\d{5}-\d{4})$/s;
    $state =~ s/\s+/ /g;
    $address =~ s/\s+/ /g;

    if ($debug > 5) {
        # NOTE(review): $subjectToSection16 is not assigned anywhere in this
        # file as seen here, so this line presumably prints an empty value.
        print "subject to section 16: $subjectToSection16\n";
        print "IRS ID: $irsId\n";
        print "DIRECTOR: $director; OWNER: $owner; OFFICER: $officer; OTHER: $other\n";
        print "relationship addendum: $relationAddendum\n";
        print "filing type info: $filingType\n";
        print "JOINT: $jointFiling\n";
        print "INDIVIDUAL: $indFiling\n";
        print "ISSUER DATA: $issuerData\n";
        print "issuer: $issuerName\n";
        print "ticker: $issuerTicker\n";
        print "first: $firstName\n";
        print "middle: $middleName\n";
        print "last: $lastName\n";
        print "address: $address\n";
        print "city: $city\n";
        print "state: $state\n";
        print "ZIP: $zipCode\n";
    }

    return($firstName, $middleName, $lastName, $address, $city, $state, $zipCode,
           $issuerName, $issuerTicker, $irsId, $eventMonth, $eventDay, $eventYear,
           $amendMonth, $amendDay, $amendYear, $director, $owner, $officer, $other,
           $relationAddendum, $indFiling, $jointFiling);
}

#
# Subroutine to get header information
#
# Returns the list ($notSubjectToSection16, $form3HoldingsReported,
# $form4TransactionsReported), each being whatever isChecked() returned.
#
sub getFormHeaderInfo {
    my($text) = shift;
    my($notSubjectToSection16, $form3HoldingsReported, $form4TransactionsReported);

    # get whether it's no longer subject to section 16
    $notSubjectToSection16 = &isChecked("Check this box if", $text);
    # Only try the alternative wording when the first check came back
    # negative.  This used to test $subjectToSection16, a variable that is
    # never set, so the fallback always ran and could overwrite a positive
    # first result.
    if (! $notSubjectToSection16) {
        $notSubjectToSection16 = &isChecked("no longer Subject", $text);
    }

    # get whether form 3 holdings are reported
    $form3HoldingsReported = &isChecked("Form 3 Holdings Rep", $text);

    # get whether form 4 transactions are reported
    $form4TransactionsReported = &isChecked("Form 4 Trans", $text);

    return($notSubjectToSection16, $form3HoldingsReported, $form4TransactionsReported);
}

#
# Subroutine to get a data element from an ASCII table
#
# Takes text whose header is laid out as a "|"-delimited ASCII table and
# returns it with the cell text regrouped into one run per table cell,
# followed by everything from "TABLE I" onwards unchanged.
#
sub getAsciiTableData {
    my($text) = shift;
    my($headerData, $whatsLeft, $numLines, $currentPlacement, $headerOut, $elementCnt, $lineCnt);
    my(@lines, @elementArr, @elementArr2, @placementArr, @data);
    my($numElements);

    # grab stuff we want
    if ($text =~ /TABLE I/) {
        ($headerData, $whatsLeft) = $text =~ /(^\|.*?)(TABLE I.*)/s;
    }
    else {
        $headerData = $text;
    }

    (@lines) = split(/\n/, $headerData);
    $numLines = $#lines + 1;

    # drop the outer border characters of every line
    for ($lineCnt = 0; $lineCnt < $numLines; $lineCnt++) {
        $lines[$lineCnt] =~ s/^\|//;
        $lines[$lineCnt] =~ s/\|$//;
    }

    $currentPlacement = 0;
    for ($lineCnt = 0; $lineCnt < $numLines; $lineCnt++) {
        @elementArr = ();
        @elementArr2 = ();
        @elementArr = split(/\|/, $lines[$lineCnt]);
        # look ahead one line (empty list once past the last line)
        @elementArr2 = split(/\|/, $lines[$lineCnt + 1]);
        $numElements = $#elementArr + 1;
        for ($elementCnt = 0; $elementCnt < $numElements; $elementCnt++) {
            if (defined($placementArr[$elementCnt])) {
                # a row of dashes followed by a numbered item ("1.", "2.", ...)
                # starts a new output slot for this column
                if ($elementArr[$elementCnt] =~ /^-*-$/) {
                    if ($elementArr2[$elementCnt] =~ /\s*\d+\./) {
                        $placementArr[$elementCnt] = $currentPlacement;
                        $currentPlacement++;
                    }
                    # we don't need this string data, null it out
                    $elementArr[$elementCnt] = "";
                }
            }
            else {
                # first time this column is seen: give it its own slot
                $placementArr[$elementCnt] = $currentPlacement;
                $currentPlacement++;
            }
            $data[$placementArr[$elementCnt]] .= "\n" . $elementArr[$elementCnt];
        }
    }

    $headerOut = "";
    foreach my $dataItem (@data) {
        $headerOut .= $dataItem . "\n";
    }

    # $whatsLeft is only set when "TABLE I" was found and the split matched
    $text = $headerOut . "\n" . (defined($whatsLeft) ? $whatsLeft : "");
    $text;
}

#
# Subroutine to get a data element from a parsed table
#
# Searches the first $rowno rows and $colno columns of @data (an array of
# row array-refs) for a cell matching $caption, case-insensitively, and
# returns the cell directly below the first match.
#
# NOTE(review): when nothing matches, $caprow and $capcol are still -1, so
# the value returned is $data[0][-1]; callers presumably never rely on that.
#
sub getTableData {
    my($caption, $rowno, $colno, @data) = @_;
    my($row, $col, $caprow, $capcol);

    $caprow = -1;
    $capcol = -1;
    for($row=0;$row<$rowno;$row++) {
        for($col=0;$col<$colno;$col++) {
            if($data[$row][$col] =~ /$caption/i) {
                $caprow = $row;
                $capcol = $col;
                last;
            }
        }
        # numeric test; this was a string comparison (ne) before
        if($caprow != -1) {
            last;
        }
    }
    return $data[$caprow+1][$capcol];
}

#
# Sub to replace caption tags with tags
#
# NOTE(review): three patterns here (the split, the $tabletags capture and
# the first substitution) look like they lost a tag literal; only the
# escaped <\/CAPTION> survived.  They are kept exactly as found.
#
sub removeCaptionTag {
    my($table) = @_;
    my(@rows) = split(/.*/, $table);
    my($tabletags, $rowcount);

    $rowcount = 0;
    foreach my $row (@rows) {
        ($tabletags) = ($table =~ /(.*)/);
        $table =~ s/$tabletags//;
        $rows[$rowcount] =~ s/\n/$tabletags\n/;
        $rows[$rowcount] =~ s/<\/CAPTION>/\n-------------------------------------------------/;
        $rowcount++;
    }

    $table="";
    foreach my $row (@rows) {
        $table .= $row;
        $table .= "\n";
    }
    return $table;
}

#
# Generic sub to parse tables
#
# NOTE(review): parseTable is truncated in this copy of the file.  The text
# breaks off inside a string literal, and everything up to the middle of
# getDateParts (including all of isChecked, which the subs above call) is
# missing.  The surviving fragment is preserved below, commented out, so that
# nothing is lost; it cannot compile as it stands.  parseTable and isChecked
# therefore remain undefined until restored from the original source.
# Whoever restores it should also look at "$numcol gt $maxcol", which is a
# string comparison on what findNoCols returns as a number.
#
#   sub parseTable {
#       local($table1) = @_;
#       local(@data, @rows, @elements, @elementArr, @ele, @collen);
#       local($tabletemp, $row, $cols, $numcol, $start, $colno, $rowdata,
#             $element, $el, $match, $rest);
#       local($maxcol, $elementCnt);
#       $maxcol = 0;
#       # If table has a CAPTION tag, it has to be fixed
#       if($table1 =~ /CAPTION/) { $table1 = removeCaptionTag($table1); }
#       # remove superfluous TABLE tags
#       $table1 =~ s/\//sig;
#       $table1 =~ s/\<\/TABLE\>//sig;
#       # remove document.table tags
#       $table1 =~ s###sig;
#       $table1 =~ s###sig;
#       # Separate table into rows separated by ... tags
#       @rows = split(/.*/, $table1);
#       # For each row, we need to extract data
#       shift(@rows);
#       $tabletemp = $table1;
#       $rowdata = 0;
#       foreach $row (@rows) {
#           # Get the sequence for this row
#           ($cols) = ($tabletemp =~ /(.*)/);
#           $tabletemp =~ s/$cols//;
#           # Get column widths
#           $numcol = findNoCols($cols);
#           if($numcol gt $maxcol) { $maxcol = $numcol; }
#           $start = "
#   [... text missing ...]
#   @@g;
#

#
# Subroutine to break a date string into month, day, and year
#
# Returns the list ($month, $day, $year).
#
# NOTE(review): the opening of this sub was lost together with the end of
# parseTable.  The "sub" line and the two declarations below are
# reconstructed from how the sub is called in this file and from the
# variables the surviving body uses.  At least one statement ending in
# "@@g;" (a substitution on, presumably, $dateStr) preceded the trim call and
# is still missing -- TODO restore from the original source.
#
sub getDateParts {
    my($dateStr) = shift;
    my($month, $day, $year);

    # remove spaces at beginning and end
    $dateStr = EDGAR::XML::trim($dateStr);

    # remove extra white space in date info
    $dateStr =~ s@\s{2,}@ @g;

    if ($dateStr =~ /\//) {
        # month/day/year format
        ($month, $day, $year) = split(/\//, $dateStr);
        # in some cases, no day is given
        if (! $year) {
            $year = $day;
            $day = "";
        }
    }
    elsif ($dateStr =~ /,/) {
        # month day, year format
        ($month, $day, $year) = $dateStr =~ /(.*?) (\d{1,2}), (\d{2,4})/;
    }
    else {
        # spaces between data items
        ($month, $day, $year) = split(/\s+/, $dateStr);
    }

    # month should be written out, so if a number make the change
    if ($month =~ /\d+/) {
        $month = $EDGAR::date::Months[$month];
    }

    # remove anything other than numbers from year and day
    $year =~ s/\D//g;
    $day =~ s/\D//g;

    # pad digits to years, giving a 4 number year:
    # two-digit years above 70 become 19xx, all others 20xx
    if (length($year) == 2) {
        if ($year > 70) {
            $year = "19" . $year;
        }
        else {
            $year = "20" . $year;
        }
    }
    elsif ((length($year) == 6) || (length($year) == 5)) {
        # day and year ran together once the non-digits were removed
        ($day, $year) = $year =~ /(\d{1,2})(\d{4})/;
    }

    # pad a zero to the front, if the day is a single number
    if (length($day) == 1) {
        $day = "0" . $day;
    }

    return ($month, $day, $year);
}

#
# format a value to a number
#
# Note that "-" is among the characters removed, so a leading minus sign
# does not survive.
#
sub toNumber {
    my($number) = shift;

    # drop anything in parentheses
    $number =~ s/\(.*\)//g;

    # drop whitespace, letters, "_", "-", ",", "$" and "*"
    $number =~ s/[\sA-Za-z_\-\,\$\*]//g;

    $number;
}

#
# create beneficial_1 XML string
#
# NOTE(review): this body is what is left after the text between the first
# string literal here and the last one of a later sub was lost.  As it
# stands it builds $outStr and then returns $ofStr, which is never set in
# this sub.  getNatureOfOwnershipXML, getRelationXML and getOwnershipFormXML,
# all described in the POD below, are missing from this copy.  Kept as found;
# TODO restore from the original source.
#
sub getBeneficial1StartXML {
    my($aMonth, $aYear, $notSubjectToSection16, $form3HoldingsReported, $form4TransactionsReported) = @_;
    my($outStr) = "";

    $outStr .= "\n";
    $ofStr;
}

#
# create name XML string
#
# Builds an opening tag named $nameType.  lastName, firstName and middleName
# attributes are added only for non-empty values; fullName is always added.
#
sub getNameXML {
    my($nameType, $first, $middle, $last) = @_;
    my($nameStr) = '';

    $nameStr = "<" . $nameType;
    if (length($last)) {
        $nameStr .= " lastName = " . EDGAR::XML::encode_av($last);
    }
    if (length($first)) {
        $nameStr .= " firstName = " . EDGAR::XML::encode_av($first);
    }
    if (length($middle)) {
        $nameStr .= " middleName = " . EDGAR::XML::encode_av($middle);
    }
    $nameStr .= " fullName = ";
    $nameStr .= EDGAR::XML::encode_av($first ." ". $middle ." ". $last) . " >\n";

    $nameStr;
}

#
# create postal XML string
#
# NOTE(review): every string literal here that presumably held an element
# tag is now empty or just "\n", so the output is the trimmed values one per
# line.  Kept as found; TODO restore the tag literals.
#
sub getPostalXML {
    my($street, $city, $region, $code, $country) = @_;
    my($postalStr) = '';

    $postalStr = "\n";
    if (length($street)) {
        $postalStr .= "". EDGAR::XML::trim($street) ."\n";
    }
    if (length($city)) {
        $postalStr .= "". EDGAR::XML::trim($city) ."\n";
    }
    if (length($region)) {
        $postalStr .= "". EDGAR::XML::trim($region) ."\n";
    }
    if (length($code)) {
        $postalStr .= "". EDGAR::XML::trim($code) ."\n";
    }
    if (length($country)) {
        $postalStr .= "". EDGAR::XML::trim($country) ."\n";
    }
    $postalStr .= "\n";

    $postalStr;
}

#
# create telephone XML string
#
# NOTE(review): the final literal is just "\n"; a closing tag presumably
# stood there.  Kept as found.
#
sub getPhoneXML {
    my($phoneType, $area, $number) = @_;
    my($phoneStr) = "";

    $phoneStr = "<" . $phoneType . ">";
    $phoneStr .= EDGAR::XML::trim($area) . "-" . EDGAR::XML::trim($number);
    $phoneStr .= "\n";
    #if (length($area)) {
    #    $phoneStr .= "area = " . EDGAR::XML::encode_av($area);
    #}
    #if (length($number)) {
    #    $phoneStr .= " number = " . EDGAR::XML::encode_av($number);
    #}

    $phoneStr;
}

#
# create Date XML string
#
# Returns an empty string when $year is empty; otherwise an empty-element
# tag named $dateType with optional day and month attributes and a year
# attribute.
#
sub getDateXML {
    my($dateType, $month, $day, $year) = @_;
    my($dateStr) = "";

    if (length($year)) {
        $dateStr .= "<$dateType ";
        if (length($day)) {
            $dateStr .= "day = " . EDGAR::XML::encode_av($day);
        }
        if (length($month)) {
            $dateStr .= " month = " . EDGAR::XML::encode_av($month);
        }
        $dateStr .= " year = " . EDGAR::XML::encode_av($year);
        $dateStr .= " />\n";
    }

    $dateStr;
}

1;  # keep require happy

=head1 NAME

edgardoc-util.pl - EDGAR parsing routines

=head1 PACKAGE

EDGARdoc::util

=head1 SYNOPSIS

    require "edgardoc-util.pl";

=head1 REQUIRES

Perl, version 5.004 or higher.

=head1 DESCRIPTION

EDGAR parsing routines.

=over 3

=head1 METHODS

=for comment
NOTE(review): most of the "XML:" examples below are empty in this copy of the
file; the markup appears to have been stripped and needs restoring.

=head2 findNoCols

=item *

obtains the number of columns in a row.

=item *

The number of columns is returned.

=item example:

    my ($numColumns) = &EDGARdoc::util::findNoCols($columns);

=head2 fixText

=item *

removes unnecessary characters and data which would complicate parsing.

=item *

The fixed text is returned.

=item example:

    my ($fixedText) = &EDGARdoc::util::fixText($text);

=head2 getFormFooterInfo

=item *

parses out the footer information (signature and signature date).

=item *

Returns the signature and signature date.

=item example:

    my ($sig, $sigDate) = &EDGARdoc::util::getFormFooterInfo($text);

=head2 getFormInitialQuestionsInfo

=item *

obtains the data from the initial EDGAR form questions.

=item *

The initial question data is returned, if it exists, which includes:
first name, middle name, last name, street address, city, state, zip code,
issuer name, issuer ticker, IRS id, event month, event day, event year,
amendment month, amendment day, amendment year, director, owner, officer,
other, relationship addendum, individual filing, joint filing

=item example:

    my (@initialQues) = &EDGARdoc::util::getFormInitialQuestionsInfo($text);

=head2 getFormHeaderInfo

=item *

obtains the header information.

=item *

The following are returned: whether the filer is no longer subject to
Section 16, whether form 3 holdings are reported, and whether form 4
transactions are reported.

=item example:

    my ($notSection16, $form3, $form4) = &EDGARdoc::util::getFormHeaderInfo($text);

=head2 getAsciiTableData

=item *

ASCII table data is reformatted and put into question format.

=item *

String is returned with ASCII table formatting removed.

=item example:

    my ($normalText) = &EDGARdoc::util::getAsciiTableData($tableText);

=head2 getTableData

=item *

obtains a data element from a parsed table array.

=item *

A data element is returned.

=item example:

    my ($value) = EDGARdoc::util::getTableData('Name and Address', $maxrow, $maxcol, @data);

=head2 removeCaptionTag

=item *

removes EDGAR table tags.

=item *

The "fixed" table is returned.

=item example:

    my ($fixedTable) = &EDGARdoc::util::removeCaptionTag($table);

=head2 parseTable

=item *

puts an EDGAR table into a two dimensional array.

=item *

The number of rows, columns, and the data are returned.

=item example:

    ($maxrow, $maxcol, @data) = EDGARdoc::util::parseTable($table);

=head2 isChecked

=item *

determines if a question has been checked.

=item *

Example Question: _X_ I'm a programmer.

=item *

A "checked" question will return a 1, otherwise a 0 is returned.

=item example:

    $officer = &EDGARdoc::util::isChecked("Officer", $issuerInfo);

=head2 getDateParts

=item *

This routine takes a date string and breaks it into month, day, and year.

=item *

month, day, and year are returned, in that order

=item example:

    ($month, $day, $year) = EDGARdoc::util::getDateParts($eventDate);

=head2 toNumber

=item *

Removes extraneous characters from number data.

=item *

a number is returned

=item example:

    $percentClass = EDGARdoc::util::toNumber($percentClass);

=head2 getBeneficial1StartXML

=item *

creates XML string for beneficial 1 data.

=item *

Beneficial 1 XML string is returned.

=item example:

    $textOut .= EDGARdoc::util::getBeneficial1StartXML($amendMonth, $amendYear, $notSubjectToSection16, $form3HoldingsReported, $form4TransactionsReported);

=item XML:

=head2 getNatureOfOwnershipXML

=item *

creates XML string for nature of ownership data.

=item *

Nature of Ownership XML string is returned.

=item example:

    $textOut = EDGARdoc::util::getNatureOfOwnershipXML($nowData);

=item XML:

    Inherited from father

=head2 getRelationXML

=item *

creates XML string for relationship to issuer data.

=item *

Relationship to issuer XML string is returned.

=item example:

    $textOut .= EDGARdoc::util::getRelationXML($director, $officer, $owner, $other, $relationAddendum);

=item XML:

=head2 getOwnershipFormXML

=item *

creates XML string for ownership.

=item *

Ownership XML string is returned.

=item example:

    $textOut = EDGARdoc::util::getOwnershipFormXML($ownershipForm);

=item XML:

=head2 getNameXML

=item *

creates XML string for name data.

=item *

Name XML string is returned.

=item example:

    $textOut = &EDGARdoc::util::getNameXML("filer", $firstName, $middleName, $lastName);

=item XML:

=head2 getPostalXML

=item *

creates XML string for postal data.

=item *

postal XML string is returned.

=item example:

    $textOut .= &EDGARdoc::util::getPostalXML($address, $city, $state, $zipCode);

=item XML:

    100 Main Street Los Angeles CA 90090 USA

=head2 getPhoneXML

=item *

creates XML string for phone data.

=item *

Phone XML string is returned.

=item example:

    $xml = EDGARdoc::util::getPhoneXML("phone", $area, $number);

=item XML:

    703-222-1111

=head2 getDateXML

=item *

creates XML string for date data.

=item *

Date XML string is returned.

=item example:

    $xml = EDGARdoc::util::getDateXML("marketDate", $month, $day, $year);

=item XML:

=back

=head1 COPYRIGHT

Copyright 1999 Invisible Worlds.

=cut