#!/iw/perl/current/bin/perl # # file: cbd2dtd # desc: Get the final XML output # # limit scope package CBD::cbddtd; use strict; use vars qw($VERSION $RELEASE); # # current module verion my $Id =<<'EoI'; # $Id: //depot/isms/skulker/cbd/1.1/bin/cbd2dtd#4 $ EoI # my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/); my $VERSION = "1.1"; BEGIN { my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw"; my $cbd_root = "$iw_root/skulker/cbd/current"; # where we find our local libraries my($libdir) = "$cbd_root/lib"; if ( -d $libdir) { unshift(@INC, $libdir); } # CBD LIB support require 'cbd-lib.pl'; } # Get Date parts from a string sub getDateParts { my($dateStr) = shift; my($month, $day, $year); my(@Months) = ('', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'); # remove an extra dash or colon in date info $dateStr =~ s/-/ /g; $dateStr =~ s/:/ /g; $dateStr =~ s/'/ /g; $dateStr =~ s@@@g; # remove extra white space in date info $dateStr =~ s@\s{2,}@ @g; if ($dateStr =~ /\//) { # month/day/year format ($month, $day, $year) = split(/\//, $dateStr); # in some cases, no day is given if (! $year) { $year = $day; $day = ""; } } elsif ($dateStr =~ /,/) { # month day, year format ($month, $day, $year) = $dateStr =~ /(.*?) (\d{1,2}),\s*(\d{2,4})/; } else { ($month, $day, $year) = split(/\s+/, $dateStr); } # month should be written out, so if a number make the change if ($month =~ /\d+/) { $month = $Months[$month]; } # remove anything other than numbers from year and day $year =~ s/\D//g; $day =~ s/\D//g; # pad digits to years, giving a 4 number year if (length($year) == 2) { if ($year > 70) { $year = "19" . $year; } else { $year = "20" . $year; } } elsif ((length($year) == 6) || (length($year) == 5)) { ($day, $year) = $year =~ /(\d{1,2})(\d{4})/; } # pad a zero to the front, if the day is a single number if (length($day) == 1) { $day = "0" . $day; } return ($month, $day, $year); } # Get Address parts sub getAddressParts { my($address) = @_; my(@country) = ('Liechtenstein', 'Japan'); my(@states) = ('New Jersey', 'NJ', 'New York', 'NY', 'California', 'CA', 'Virginia', 'VA', 'Nebraska', 'NE', 'Florida', 'FL', 'Oklahoma', 'OK', 'Massachusetts', 'MA', 'Utah', 'UT', 'ILLINOIS', 'IL', 'Kentucky', 'KY', 'Georgia', 'GA', 'Alabama', 'AL', 'Connecticutt', 'CT', 'Texas', 'TX', 'Pennsylvania', 'PA', 'Delaware', 'DE', 'Mississippi', 'MS', 'Minnesota', 'MN', 'Iowa', 'IA', 'Missouri', 'MO', 'Wisconsin', 'WI', 'Maryland', 'MD', 'North Carolina', 'NC', 'South Carolina', 'SC', 'Indiana', 'IN', 'West Virginia', 'WV', 'Ohio', 'OH', 'North Dakota', 'ND', 'South Dakota', 'SD', 'Washington', 'WA', 'Oregon', 'OR', 'Arizona', 'AZ', 'Alaska', 'AK', 'Vermont', 'VT', 'New Hampshire', 'NH', 'Tennesse', 'TN', 'Lousiana', 'LA', 'Kansas', 'KS', 'Arkansas', 'AR', 'Wyoming', 'WY', 'Colorado', 'CO', 'Idaho', 'ID', 'Michigan', 'MI', 'Nevada', 'NV', 'Hawai', 'HI', 'Maine', 'ME', 'Montana', 'MT', 'DC', 'Rhode Island', 'RI', 'New Mexico', 'NM'); my($statedone) = 0; my($state, $st, $street, $city, $country, $zip, $c, $l, $linecount, $index); my($office); my(@lines); $address =~ s/_{3,}//gsi; foreach $st(@states) { if($address =~ /\b$st\b\.*,*\s*\d+/i) { $state = $st; last; } } # Check if international my($international); foreach $c(@country) { if($address =~ /$c\W*/) { $country = $c; } } # United States if(length($country) eq 0) { # Get zip code if(length($state) gt 0) { ($zip) = ($address =~ /.*$state\.*,*\s*(\d+-*\d*)/si); # Get city ($street) = ($address =~ /(.*)$state.*$zip/si); ($street) = ($address =~ /(.*)$city.*$state.*$zip/si); @lines = split(/,/, $street); $linecount = 0; foreach $l(@lines) { $linecount++; } for($index=$linecount-1;$index>=0;$index--) { if($lines[$index] =~ /\w+/) { $city = $lines[$index]; last; } } $city =~ s/\)//gsi; $city =~ s/\(//gsi; $street =~ s/$city//; if($street =~ /\w+/) { # Do nothing } else { $street = $city; $city = ""; } $office = $street; @lines = split(/,/, $office); $linecount = 0; foreach $l(@lines) { $linecount++; } for($index=$linecount-1;$index>=0;$index--) { if($lines[$index] =~ /\w+/) { $street = $lines[$index]; $street =~ s/\(//gsi; $street =~ s/\)//gsi; last; } } $office =~ s/$street//; if($office =~ /\w+/) { # Do nothing } else { $office = $street; $street = ""; } } else { $office = $address; } } else { # Get zip code ($zip) = ($address =~ /.*(\d*-*\d*)\W*$country/si); ($street) = ($address =~ /(.*)$state.*$zip/si); ($street) = ($address =~ /(.*).*$zip.*$country/si); @lines = split(/,/, $street); $linecount = 0; foreach $l(@lines) { $linecount++; } for($index=$linecount-1;$index>=0;$index--) { if($lines[$index] =~ /\w+/) { $city = $lines[$index]; last; } } $street =~ s/$city//gsi; if($street =~ /\w+/) { # Do Nothing } else { $street = $city; $city = ""; } $office = $street; @lines = split(/,/, $office); $linecount = 0; foreach $l(@lines) { $linecount++; } for($index=$linecount-1;$index>=0;$index--) { if($lines[$index] =~ /\w+/) { $street = $lines[$index]; last; } } $office =~ s/$street//gsi; if($office =~ /\w+/) { # Do nothing } else { $office = $street; $street = ""; } } $office =~ s/TO //gsi; $street =~ s/$city//gsi; $office =~ s/,/ /gsi; $street =~ s/,/ /gsi; $city =~ s/,/ /gsi; $zip =~ s/,/ /gsi; return ($office, $street, $city, $state, $zip, $country); } # Get Phone numbers sub getContactPhone { my($in) = @_; my($found) = 1; my(@phone); my($count) = 0; while($found) { if($in =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/g) { ($phone[$count], $in) = ($in =~ /(\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d)(.*)/g); $count++; } else { $found = 0; } } return(@phone); } # Get Email Addresses sub getEmailAddress { my($in) = @_; my($found) = 1; my(@email); my($count) = 0; while($found) { if($in =~ /\S+?@\S+/g) { ($email[$count], $in) = ($in =~ /(\S+?@\S+)/); $count++; } else { $found = 0; } } return(@email); } # # remove trailing and leading spaces # sub trim { my($text) = shift; $text =~ s/^\s+|\s+$//g; $text; } my($XMLout, $input, $firsttag, $endtag, $reqchunk, $day, $month, $year, $date); my($link, $count, $found, $temp, $pubdate, $maildate, $issue, $type); my($pubday, $pubdayname, $pubmonth, $pubyear, $mailday, $mailmonth, $mailyear); my($classcod, $code, $desc, $offadd, $newclasscod, $street, $city, $region, $zip); my($country, $newoffadd, $subject, $newsubject, $respdate, $newrespdate, $newlink); my($uri, $email, $newemail, $address, $awarddate, $newawarddate); my($awardee, $newawardee, $contact, $newcontact, $newdesc, $u, $office); my(@url, @phone, @personname, @persontitle, @mail); my(@req, @lines, @contacts); my($recd, $submitted, $submissionnumber, $origsub, $subpart, $subnum, $solnbr); my(@Months) = ('', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'); my($debug) = 0; # for all files do FILE: foreach my $file(@ARGV) { print STDERR "Processing $file ...\n" if ($debug); #$XMLout = "\n"; $input = ""; open(IN, "$file") || (warn "** error reading $file: $!\n", next FILE); while() { $input .= $_; } # Get some header information ($issue) = ($input =~ /(.*)<\/ISSNBR>/s); ($pubdate) = ($input =~ /(.*)<\/PUBDATE>/s); ($maildate) = ($input =~ /(.*)<\/MAILDATE>/s); ($pubdayname) = ($pubdate =~ /(.*)<\/DAY>/s); ($pubdate) = ($pubdate =~ /(.*)<\/DATE>/s); $maildate =~ s/\s*MAILED ON\s*//g; ($pubmonth, $pubday, $pubyear) = &getDateParts($pubdate); ($mailmonth, $mailday, $mailyear) = &getDateParts($maildate); # Get rid of useless data ($firsttag) = ($input =~ /.*?(|.*?<\/DATE>)/s); $reqchunk =~ s/$date//g; ($month, $day) = ($date =~ /.*(\d\d)(\d\d).*/); $month = $Months[$month]; ($year) = ($reqchunk =~ /(.*?<\/YEAR>)/s); $reqchunk =~ s/$year//g; $year =~ s/<\/YEAR>//s; $year =~ s///s; if($year < 70) { $year = 2000+ $year; } elsif($year < 100) { $year = 1900 + $year; } # Additional cleanup $reqchunk =~ s/$firsttag.*?>//; $reqchunk =~ s/$endtag//; # Spit out recd ($recd) = ($reqchunk =~ /(.*?<\/RECD>)/s); ($recd) = ($recd =~ /(.*)<\/RECD>/s); ($recd) = ($recd =~ /(\w-\d\d\d SN\d\d\d\d\d\d)/); $recd = &trim($recd); ($submitted, $submissionnumber) = ($recd =~ /(\S+)\s(.*)/); $origsub = $submitted; $origsub =~ s/\(//gsi; $origsub =~ s/\)//gsi; $origsub =~ s/-//gsi; $origsub =~ s/[a-zA-Z]//gsi; if($submitted =~ /I/) { $submitted = "provider-email"; } elsif($submitted =~ /D/) { $submitted = "direct-email"; } elsif($submitted =~ /W/ ) { $submitted = "web"; } elsif($submitted =~ /M/) { $submitted = "manuscript"; } elsif($submitted =~ /A/) { $submitted = "ftp"; } $submissionnumber =~ s/\)//g; # Fix ClassCod ($classcod) = ($reqchunk =~ /(.*?<\/CLASSCOD>)/s); ($desc) = ($classcod =~ /(.*)<\/CLASSCOD>/s); $desc = trim($desc); if($desc =~ /^(\S+) - (.*)$/) { ($code, $desc) = ($1, $2); } $newclasscod = "\n"; if($code =~ /[A-Z]/) { $subpart = "services"; } elsif($code =~ /[0-9]/) { $subpart = "supplies"; } $recd =~ s/\(//g; $recd =~ s/\)//g; # start spitting out into xmlout $subnum = $submissionnumber; $subnum =~ s/SN//gsi; $XMLout = ""; $XMLout .= "\n\n"; $XMLout .= "\n"; $XMLout .= "\n"; $XMLout .= "\n"; $XMLout .= "\n"; # Spit out pubdate, maildate and issue $XMLout .= "\n"; $XMLout .= "\n"; $issue =~ s/Issue No\.//gsi; $issue = &trim($issue); $XMLout .= "$issue\n"; $XMLout .= "\n"; $XMLout .= "\n"; # Spit out Date $XMLout .= "\n"; # Spit out zip ($zip) = ($reqchunk =~ /(.*?<\/ZIP>)/); $zip =~ s///; $zip =~ s/<\/ZIP>/<\/zip>/; $XMLout .= "$zip\n"; $XMLout .= "$newclasscod\n"; # Fix Offadd ($offadd) = ($reqchunk =~ /(.*?<\/OFFADD>)/s); ($newoffadd) = ($offadd =~ /(.*)<\/OFFADD>/s); ($office, $street, $city, $region, $zip, $country) = &getAddressParts($newoffadd); $newoffadd = "\n"; $newoffadd .= "\n"; $street = &trim($street); $newoffadd .= "$street\n"; $city = &trim($city); $newoffadd .= "$city\n"; $region = &trim($region); $newoffadd .= "$region\n"; $zip = &trim($zip); $newoffadd .= "$zip\n"; if($country =~ /\w+/) { $country = &trim($country); $newoffadd .= "$country\n"; } $newoffadd .= "\n"; $office = &trim($office); $newoffadd .= "$office\n"; $newoffadd .= "\n"; if($offadd =~ /\w+/) { $XMLout .= "$newoffadd\n"; } # Fix subject ($subject) = ($reqchunk =~ /(.*?<\/SUBJECT>)/s); ($desc) = ($subject =~ /(.*)<\/SUBJECT>/s); $desc = &trim($desc); if($desc =~ /^(\S+) - (.*)$/) { ($code, $desc) = ($1, $2); } $newsubject = "\n"; if($subject =~ /\w+/) { $XMLout .= "$newsubject\n"; } # Spit out solnbr ($solnbr) = ($reqchunk =~ /(.*?<\/SOLNBR>)/s); $solnbr =~ s///; $solnbr =~ s/<\/SOLNBR>/<\/solicit.number>/; $XMLout .= "$solnbr\n"; # Fix respdate ($respdate) = ($reqchunk =~ /(.*?<\/RESPDATE>)/s); ($newrespdate) = ($respdate =~ /(.*)<\/RESPDATE>/s); $newrespdate =~ s/DUE\s*//g; ($month, $day, $year) = ($respdate =~ /(\d\d)(\d\d)(\d+)/); $month = $Months[$month]; if($year < 70) { $year = 2000 + $year; } elsif($year > 70 && $year < 100) { $year = 1900 + $year; } $newrespdate = "\n"; if($respdate =~ /\w+/) { $XMLout .= "$newrespdate\n"; } # Fix contact ($contact) = ($reqchunk =~ /(.*?<\/CONTACT>)/s); ($newcontact) = ($contact =~ /(.*)<\/CONTACT>/s); # @phone = &getContactPhone($newcontact); # @mail = &getEmailAddress($newcontact); $newcontact =~ s/;/;POC /; $newcontact =~ s/ or / or POC/i; @contacts = split(/,| ; | or | at |\//, $newcontact); $newcontact = "\n"; my(@splitlines); my($prev, $firstname, $middlename, $lastname, $s, $far, $url1, $url2); my($dfar, $sic, $awdnbr, $awdamt, $linenbr); my($outfilename, $dir); foreach my $u (@contacts) { if($u =~ /POC/i || $u =~ /Points* of contact/i || $u =~ /Contact Point/i) { if($prev =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { $newcontact .= "/>\n"; $prev = ""; } $u =~ s/POC//; $u =~ s/Points* of contact//gsi; $u =~ s/Contact Point//gsi; $u = &trim($u); ($firstname, $middlename, $lastname) = split(/\s/, $u); if($lastname =~ /\w+/) { # Do nothing } else { $lastname = $middlename; $middlename = ""; } $newcontact .= "\n"; } elsif($u =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { if($prev =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { $prev = ""; } if($u =~ /Fax/si) { ($u) = ($u =~ /(\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d)/); $u =~ s/Fax//si; $u = &trim($u); $newcontact .= "$u" . "\n"; } else { ($u) = ($u =~ /(\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d)/); $u =~ s/Phone//si; $u = &trim($u); $newcontact .= "$u" . "\n"; } $prev = $u; } elsif($u =~ /\S+@\S+/) { if($prev =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { $newcontact .= "/>\n"; $prev = ""; } $u =~ s/Email//i; ($u) = ($u =~ /(.*@.*\.\w\w\w?)/); $u =~ s/"//gsi; $u = &trim($u); $newcontact .= "\n"; } else { if($prev =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { ## $newcontact .= "/>\n"; $prev = ""; } $u = &trim($u); $newcontact .= "$u\n"; } if($prev =~ /\(?\d\d\d\)?\s*-*\d\d\d\s*-*\d\d\d\d/) { ## $newcontact .= "/>\n"; $prev = ""; } } $newcontact .= "\n"; if($contact =~ /\w+/) { $XMLout .= "$newcontact\n"; } # Fix desc ($desc) = ($reqchunk =~ /(.*?<\/DESC>)/s); ($desc) = ($desc =~ /(.*)<\/DESC>/); # Fix split lines @splitlines = split(/\n/, $desc); $desc = ""; foreach $s(@splitlines) { $desc .= $s; } $s =~ s/\!\!//gsi; $newdesc = "\n"; # Skulked stuff $newdesc .= "\n"; # Look for links $found = 1; $temp = $desc; while($found) { if($temp =~ /http:\/\/\S+/) { ($link, $temp) = ($temp =~ /(http:\/\/\S+)(.*)/s); $link =~ s/\(//; $link =~ s/\)//; $link =~ s/\*//g; $link =~ s/mdash;/_/gsi; $link =~ s/llmdash;/_/gsi; $link =~ s/"//gsi; ($link) = ($link =~ /(.*\w)/); if($newdesc !~ /$link/) { $newdesc .= "\n"; if($debug) { print STDERR $link, "\n"; } } $link = ""; } else { $found = 0; } } # Look for email addresses $found = 1; $temp = $desc; while($found) { if($temp =~ /\S+@\S+/) { ($email, $temp) = ($temp =~ /(\S+@\S+)(.*)/); ($email) = ($email =~ /(.*@.*\.[a-zA-Z][a-zA-Z][a-z][A-Z]?)/); $email =~ s/\(//; $email =~ s/\)//; $email =~ s/\*//g; $email =~ s/\[//g; $email =~ s/\]//g; $email =~ s/"//gsi; $email =~ s/.*://gsi; if($email =~ /@.*\.[a-zA-Z][a-zA-Z][a-zA-Z]?$/) { if($newdesc !~ /$email/ && $email =~ /\w+/) { $newdesc .= "\n"; if($debug) { print STDERR $email, "\n"; } } $email = ""; } } else { $found = 0; } } # Look for FARs $found = 1; $temp = $desc; while($found) { if($temp =~ /FAR \d+\.\d+/) { ($far, $temp) = ($temp =~ /(FAR .*?\s)(.*)/); ($url1, $url2) = ($far =~ /(\d+)\.(\d+)/); $url2 = $url1. "." . $url2; if(length($url1) eq 1) { $url1 = "0".$url1; } if($far =~ /\d+/ && $newdesc !~ /$url1\.html#$url2/) { $newdesc .= "\n"; if($debug) { print STDERR $far, "\n"; } } $far = ""; } else { $found = 0; } } # Looking for DFARS $found = 1; $temp = $desc; while($found) { if($temp =~ /DFARS* \d+\.\d+/) { ($dfar, $temp) = ($temp =~ /(DFARS* .*?\s)(.*)/); ($url1, $url2) = ($dfar =~ /(\d+)\.(\d+)/); if($dfar =~ /\d+/) { $newdesc .= "\n"; } $dfar = ""; } else { $found = 0; } } # Looking for SICs $found = 1; $temp = $desc; while($found) { if($temp =~ /SIC.*?\d\d\d\d/) { ($sic, $temp) = ($temp =~ /(SIC.*?\d\d\d\d)(.*)/s); ($sic) = ($sic =~ /.*(\d\d\d\d)/); if($debug) { print STDERR $sic, "\n"; } $sic = "http://edgar.space.invisible.net/help/edgar/sic/sic_" . $sic . ".html"; if($newdesc !~ /$sic/) { $newdesc .= "\n"; } } else { $found = 0; } } $newdesc .= "<\/skulked.desc>\n"; # Unskulked stuff $newdesc .= "\n"; $newdesc .= "$desc\n"; $newdesc .= "\n"; $newdesc .= "\n"; $XMLout .= "$newdesc\n"; # Fix link ($link) = ($reqchunk =~ /(.*?<\/LINK>)/s); ($uri) = ($link =~ /(.*)<\/URL>/s); ($desc) = ($link =~ /(.*)<\/DESC>/s); $uri =~ s/\n/ /gsi; $desc =~ s/\n/ /gsi; $uri = &trim($uri); $uri =~ s/mdash;/_/gsi; $uri =~ s/llmdash;/_/gsi; $desc = &trim($desc); $newlink = "\n"; if($link =~ /\w+/) { $XMLout .= "$newlink\n"; } # Fix email ($email) = ($reqchunk =~ /(.*?<\/EMAIL>)/s); ($address) = ($email =~ /
(.*)<\/ADDRESS>/s); ($desc) = ($email =~ /(.*)<\/DESC>/s); $address =~ s/\n/ /gsi; $desc =~ s/\n/ /gsi; $address = &trim($address); $desc = &trim($desc); $newemail = "\n"; if($email =~ /\w+/) { $XMLout .= "$newemail\n"; } # Spit out awdnbr ($awdnbr) = ($reqchunk =~ /(.*?<\/AWDNBR>)/s); $awdnbr = &trim($awdnbr); $awdnbr =~ s///; $awdnbr =~ s/<\/AWDNBR>/<\/award.number>/; $XMLout .= "$awdnbr\n"; # Spit out awardamount ($awdamt) = ($reqchunk =~ /(.*?<\/AWDAMT>)/s); ($awdamt) = ($awdamt =~ /(.*)<\/AWDAMT>/s); $awdamt =~ s/AMT //gsi; $awdamt =~ s/\$//gsi; $awdamt =~ s/,//gsi; $awdamt = &trim($awdamt); if($awdamt =~ /\d+/) { $awdamt = "\n"; $XMLout .= "$awdamt\n"; } # Spit out linenbr ($linenbr) = ($reqchunk =~ /(.*?<\/LINENBR>)/s); $linenbr = &trim($linenbr); $linenbr =~ s///; $linenbr =~ s/<\/LINENBR>/<\/line.number>/; $XMLout .= "$linenbr\n"; # Fix awarddate ($awarddate) = ($reqchunk =~ /(.*?<\/AWDDATE>)/s); ($newawarddate) = ($awarddate =~ /(.*)<\/AWDDATE>/s); ($month, $day, $year) = ($newawarddate =~ /(\d\d)(\d\d)(\d+)/); $month = $Months[$month]; if($year < 70) { $year = 2000 + $year; } elsif($year > 70 && $year < 100) { $year = 1900 + $year; } $newawarddate = "\n"; if($awarddate =~ /\w+/) { $XMLout .= "$newawarddate\n"; } # Fix awardee ($awardee) = ($reqchunk =~ /(.*?<\/AWARDEE>)/s); ($newawardee) = ($awardee =~ /(.*)<\/AWARDEE>/s); ($office, $street, $city, $region, $zip, $country) = &getAddressParts($newawardee); $newawardee = "\n"; $newawardee .= "\n"; $street = &trim($street); $newawardee .= "$street\n"; $city = &trim($city); $newawardee .= "$city\n"; $region = &trim($region); $newawardee .= "$region\n"; $zip = &trim($zip); $newawardee .= "$zip\n"; if($country =~ /\w+/) { $country = &trim($country); $newawardee .= "$country\n"; } $newawardee .= "\n"; $office = &trim($office); $newawardee .= "$office\n"; $newawardee .= "\n"; if($awardee =~ /\w+/) { $XMLout .= "$newawardee\n"; } $XMLout .= "\n\n"; $XMLout .= "\n"; $XMLout .= "\n"; $XMLout .= "\n\n"; # Stuff in cbd name # Perform encoding ## TODO: always use xml lib encoding routine ## $XMLout =~ s/llmdash;//gsi; ## $XMLout =~ s/mdash;//gsi; ## $XMLout =~ s/emsp;/ /gsi; ## $XMLout =~ s/ensp;/ /gsi; ## $XMLout =~ s/gt;/\>/gsi; ## $XMLout =~ s/lt;/\</gsi; ## $XMLout =~ s/amp;/\&/gsi; ## $XMLout =~ s/ndash;//gsi; ## $XMLout =~ s/&&/&/gsi; ## $XMLout =~ s/&quot;/"/gsi; ## TODO: this should be a function $outfilename = $recd; $outfilename =~ s/\(//gsi; $outfilename =~ s/\)//gsi; $outfilename .= ".xml"; $dir = "/data/cbd/xml/"; $dir .= "$pubyear" . "/"; ## TODO: just build the path then use makepath() if(!opendir(DIR,$dir)) { # Create the directory mkdir($dir, 0777); } else { closedir(DIR); } $dir =~ s/-//gsi; $dir =~ s/ /_/gsi; $dir .= "$pubmonth" . "/"; if(!opendir(DIR,$dir)) { # Create the directory mkdir($dir, 0777); } else { closedir(DIR); } ## $dir =~ s/-//gsi; ## $dir =~ s/ /_/gsi; ## ($file1) = ($file =~ /.*\/(.*)\.xml/); ## $dir .= "$file1" . "/"; ## $dir =~ s/ //gsi; ## $dir =~ s/-//gsi; ## $dir =~ s/ /_/gsi; ## if(!opendir(DIR,$dir)) ## { ## # Create the directory ## mkdir($dir, 0777); ## } ## else ## { ## closedir(DIR); ## } $dir .= "$outfilename"; $dir =~ s/-//gsi; $dir =~ s/ /_/gsi; if($outfilename =~ /SN\d+/) { # output filename for live CBD processing print $dir, "\n"; open (OUT, ">$dir") or warn "cant open file"; print OUT $XMLout, "\n"; close OUT; } else { print STDERR "Error! RECD tag is empty in $file\n"; } # $XMLout = "\n"; # print STDERR $reqchunk, "\n"; # print STDERR "-----------------------------------------------------------------\n"; # get next start tag ($firsttag) = ($input =~ /.*?(. =cut