#!/iw/perl/current/bin/perl
#
# file: fix-cbd
# desc: clean up CBD files for XML conversion
#
# Summary of what the visible code does: for every file named on the command
# line it opens the file, applies a series of regex substitutions and
# CBD::XML::encode() to a per-line buffer ($line), puts $DOCTYPE at the front
# of @submission, and writes @submission (newline-joined) to a file whose
# name ends in ".nc".
#
# Options (see getopts('D:b:') below):
#   -D value   debug flag; when true a "Processing <file> ..." message is
#              printed to STDERR for each input file
#   -b dir     base directory handed to CBD::util::makepath(); defaults to
#              $CBD::lib::cbd_root
#
# NOTE(review): this copy of the script appears to have been damaged in
# transit: line breaks were collapsed and text between "<" and ">" seems to
# have been dropped in several places (readline operand, DOCTYPE string, tag
# names in regexes, POD usage/author lines, and a whole stretch of code in
# the middle of the FILE loop). Each suspicious spot is flagged below. The
# code tokens are reproduced exactly as found; restore the missing parts from
# the original depot revision before relying on this file.
#

# limit scope...keep this first
package CBD::fixcbd;

# Never executed by perl itself ("if 0"); this is the classic perlrun idiom
# that lets a shell which ignores the #! line re-exec the script under perl.
eval 'exec /iw/perl/current/bin/perl -s $0 ${1+"$@"}'
    if 0;

use strict;
use vars qw($VERSION $RELEASE);

#
# current module version
my $Id =<<'EoI';
# $Id: //depot/isms/skulker/cbd/1.1/bin/fix-cbd#4 $
EoI
#
# $RELEASE is the number after the last "#" in the $Id string above.
# NOTE(review): these two lexicals have the same names as the package
# globals declared with "use vars" above, so the globals themselves are not
# assigned here -- confirm that is intended.
my $RELEASE = sprintf("%d", $Id =~ /^# \$Id: .*#(\d+)/);
my $VERSION = "1.1";

# Locate the CBD library directory and load the support libraries at compile
# time. The install root comes from $ENV{IW}, falling back to "/iw".
BEGIN {
    my $iw_root = ($ENV{IW}) ? $ENV{IW} : "/iw";
    my $cbd_root = "$iw_root/skulker/cbd/current";

    # where we find our local libraries
    my($libdir) = "$cbd_root/lib";
    # only prepend to @INC when the directory actually exists
    if ( -d $libdir) { unshift(@INC, $libdir); }

    # general CBD data types
    require 'cbd-lib.pl';

    # general CBD utility support routines
    require 'cbd-util.pl';

    # general EDGAR XML support routines
    require 'cbd-xml.pl';
}

# we want perl 5.00x or later
require 5.004;

# who am i?  ($prog is $0 with any leading directory path removed)
my $prog = "";
($prog = $0) =~ s#.*/##;

# for processing command line options
use Getopt::Std;

# process command line options, if any
# (-D and -b each take a value; Getopt::Std stores them in $opt_D / $opt_b)
use vars qw($opt_D $opt_b);
getopts('D:b:');

# where to place submissions
my($basedir) = defined($opt_b) ? "$opt_b" : $CBD::lib::cbd_root;
# presumably creates $basedir with mode 0775 if it is missing -- verify
# against CBD::util::makepath
CBD::util::makepath($basedir, 0775);

# debug mode?
my($debug) = defined($opt_D) ? $opt_D : 0;

# default DOCTYPE for CBD nc documents
# NOTE(review): the string is empty in this copy; the original almost
# certainly held a <!DOCTYPE ...> declaration that was lost -- restore it.
my($DOCTYPE) = qq||;

# Main loop: one pass per input file named on the command line.
FILE: foreach my $file (@ARGV) {
    my(@submission) = ();

    print STDERR "Processing $file ...\n" if ($debug);

    # on failure: warn and move on to the next input file
    open(IN, "$file") ||
        (warn "$prog: error reading $file: $!\n", next FILE);

    # slurp in the whole file
    # NOTE(review): the loop condition is empty in this copy; presumably it
    # was the readline operator on the IN handle opened above -- confirm.
    while () {
        chomp;

        # already *fixed*?
        # NOTE(review): text is missing from this point. The fragment on the
        # next line is reproduced exactly as found; it looks like the start
        # of one statement fused with the tail of a much later one. The
        # declarations of $line, $linecount, $ofile and $cbd_tags, and the
        # code that fills @submission, are not visible in this copy.
        # next FILE if (/^]*?)&/<$1/g) { ; }

        # remove known processing instruction elements
        # NOTE(review): the patterns below are empty or truncated in this
        # copy; the element names they matched appear to have been lost.
        $line =~ s@@@g;
        $line =~ s@]*>@@g;
        $line =~ s@@@g;
        $line =~ s@]*>@@g;
        $line =~ s@@@g;
        $line =~ s@@@g;
        $line =~ s@]*>@@g;
        $line =~ s@@@g;
        $line =~ s@]*>@@g;
        $line =~ s@@@g;
        $line =~ s@@@g;

        # encode XML entities
        $line = CBD::XML::encode($line);

        # clean up non-element markup
        # Scans every "<name" or "</name" in the line; names that do not
        # match the $cbd_tags alternation are rewritten.
        # NOTE(review): as it stands the replacement text is identical to the
        # matched text, so the substitution changes nothing. The replacement
        # probably contained an escaped "<" (an entity) that was decoded in
        # this copy -- confirm against the original.
        while ($line =~ /<(\/?)([^>\s]+)/g) {
            my $end = $1;
            my $tag = $2;
            if ($tag !~ /^($cbd_tags)$/) {
                # quotemeta so the tag text is matched literally
                my $mtag = quotemeta $tag;
                $line =~ s/<$end$mtag/<$end$tag/;
            }
        }

        # special case handling -- problems seen in a few documents
        # (adds the missing closing single quote in ='word> sequences)
        $line =~ s@='(\w+)>@='$1'>@g;

        # Extracting embedded links
        # (disabled experiment, kept as found)
        # my($link);
        # ($link) = ($line =~ /(http:\/\/.*?\s|\n)/);
        # ($link) = ($link =~ /(.*\w|\d|\/)/);
        # if($line =~ /\s*$link/)
        # {
        #     print $link, "-", $linecount, " - Foo\n";
        #$line =~ s/$link/$link$link/;
        # }
        # else
        # {
        #     print $link, "-", $linecount, "\n";
        # }

        $linecount++;
    }

    # need to force DTD info for initial conversion/validation
    unshift(@submission, $DOCTYPE);

    # write the (possibly) modified content
    # Output name: everything before the first "." in $ofile, plus ".nc".
    # A failure to open the output file is fatal for the whole run.
    # NOTE(review): no close() for IN or OUT is visible in this copy.
    $ofile = (split(/\./, $ofile))[0] . ".nc";
    open(OUT, ">$ofile") || die "$prog: error writing $ofile: $!\n";
    print OUT join("\n", @submission), "\n";
}

# c'ya
exit 0;

=head1 NAME

fix-cbd - Performs CBD pre-processing

=head1 PACKAGE

CBD::fixcbd

=head1 REQUIRED

Perl, version 5.004 or higher.

=head1 DESCRIPTION

Accepts raw CBD data file name as command line parameter and performs some fixes on the data.

=over 3

=item usage: fix-cbd

Example: fix-cbd 01NO99.CBD. This will result in 01NO99.nc

=back

=head1 COPYRIGHT

Copyright 1999 Invisible Worlds.

=head1 AUTHOR

CBD::XML was written by Gautam Yegnanarayan .

=cut