"""
Found this here:
http://herbie.bl.uk:9080/Gateway/

British Library SRU Gateway Software, licensed under the LGPL:
http://www.opensource.org/licenses/lgpl-license.php

Currently we are not using this library for anything. (10/10/2007)
-- Joel Hardi
"""

# This module defines a routine for decomposing ISO2709 records to a processable
# MARC format. There are routines that allow various pieces of data to be extracted
# from the MARC record. These work for both Unimarc and MARC records.
# When initialising a record, the character set parameter defines the base character
# set which will be converted to Unicode.
#
# NOTE: the character converters that call ``unicode`` operate on Python 2 byte
# strings. Everything else is written in syntax valid on both Python 2 and 3.

import re


class SUTRS:
    """A SUTRS (plain text) record made of ``label: value`` lines.

    Attributes set by the constructor:
        charset -- the ``charset`` argument, stored as given (not otherwise used).
        data    -- one list per non-empty input line: ``[label, value]`` when the
                   line contains a colon, otherwise a one-element list.
        fields  -- dict mapping each label to its value (later duplicates win).
    """

    def __init__(self, data, charset="utf-8"):
        """Parse *data*, a string of newline-separated ``label:value`` lines.

        Tabs are removed from every line and ``&`` / ``<`` are XML-escaped so
        that values can be placed straight into element content.
        """
        self.charset = charset
        # "&" must be escaped before "<" so the "&" of "&lt;" is not re-escaped.
        data = data.replace("&", "&amp;").replace("<", "&lt;")
        self.data = [line.replace("\t", "").split(":", 1)
                     for line in data.split("\n") if line != ""]
        self.fields = {}
        for pair in self.data:
            if len(pair) == 2:
                self.fields[pair[0]] = pair[1]

    # routines for displaying and outputting the record
    def showRecord(self):
        """Print the record as produced by listRecord()."""
        print(self.listRecord())

    def listRecord(self):
        """Return the fields as ``label : value`` lines joined by newlines."""
        return "\n".join(["%s : %s" % (label, self.fields[label])
                          for label in self.fields])

    def labelAsXML(self, labels, element="", attributes="", alternative="",
                   codeToText=None):
        """Merge several labels into one XML element.

        labels      -- list of ``[label, start, end, wrapText]`` entries; the
                       value is sliced ``[start:end]`` (``end == 0`` means "to
                       the end of the value") and inserted into ``wrapText``,
                       which must contain one ``%s``.
        element     -- element name.
        attributes  -- attribute text placed after the element name.
        alternative -- returned when none of the labels is present.
        codeToText  -- optional dict translating the sliced value; values not
                       in the dict are output unchanged.

        Returns ``<element attributes>...</element>`` followed by a newline,
        or *alternative* when nothing was produced.
        """
        pieces = []
        for label, start, end, wrapText in labels:
            if label not in self.fields:
                continue
            text = self.fields[label]
            if end == 0:
                end = len(text)
            text = text[start:end]
            if codeToText:
                text = codeToText.get(text, text)
            pieces.append(wrapText % text)
        xml = "".join(pieces)
        if xml:
            return "<%s %s>%s</%s>\n" % (element, attributes, xml, element)
        return alternative

    def present(self, label, TrueData="", FalseData=""):
        """Return *TrueData* if *label* is in the record, else *FalseData*."""
        if label in self.fields:
            return TrueData
        return FalseData


class XML:
    """An XML record held as text.

    Attributes:
        data   -- the complete text passed in.
        record -- the text between the end of the opening ``<wrapper ...>`` tag
                  and the first ``</wrapper>`` closing tag.
    """

    def __init__(self, data, wrapper):
        self.data = data
        contentStart = data.find(">", data.find("<" + wrapper)) + 1
        # Searching for "" always returns 0, which made the slice empty; the
        # content ends where the wrapper's closing tag begins.
        self.record = data[contentStart:data.find("</" + wrapper + ">")]

    def showRecord(self):
        """Print the complete text of the record."""
        print(self.data)


class ISO2709:
    '''A MARC record of some kind, the data being converted to Unicode utf-8
    depending on the initial character set defined in a parameter.

    Attributes set by the constructor:
        taglist -- list of tags in directory order, starting with "000".
        fields  -- list of ``[tag, indicators, subfield data, subfield codes]``;
                   each subfield data string starts with its one-character
                   code. Entry 0 is the 24-character leader stored as tag
                   "000", subfield "a".
        tags    -- dict mapping a tag to the list of indexes into ``fields``.
        status, type, level, hierarchy, encoding, recform, baseAddr -- leader
                   values, see setAttributes().
    '''

    def __init__(self, data, charset="ansel", recordProcess="ISO 2709"):
        """Decompose the ISO 2709 record *data*.

        data          -- the complete record as a string.
        charset       -- one of "ansel", "ISO5426", "ILS", "utf-8", "latin1",
                         "cp1252" or "cp1250"; any other value leaves the
                         field data unconverted.
        recordProcess -- accepted for compatibility; not used.
        """
        self.taglist = ["000"]      # a list of tags in the record
        # tag, indicators, subfield data as list, list of subfields
        # this sets up the header data as tag 000
        self.fields = [["000", "||", ["a" + data[0:24]], ["a"]]]
        self.tags = {"000": [0]}    # a dictionary of tags, with fields index
        charConverter = self._makeConverter(charset)
        self.setAttributes(data)

        # The directory sits between the leader and the first field terminator
        # and is made of 12-character entries: tag(3) length(4) start(5).
        directoryData = data[24:].split("\x1e", 1)[0]
        fieldIndex = 0              # the fields start at 1, 0 is the leader
        for offset in range(0, len(directoryData), 12):
            entry = directoryData[offset:offset + 12]
            tag = entry[0:3]
            self.taglist.append(tag)
            start = self.baseAddr + int(entry[7:])
            end = start + int(entry[3:7])
            fieldIndex += 1
            text = data[start:end - 1]      # drop the trailing field terminator
            if tag < "010":
                # tags below 010: preset indicators, whole content as subfield a
                self.fields.append([tag, "||", ["a" + text], ["a"]])
                self.tags[tag] = [fieldIndex]
            else:
                if charConverter:
                    text = charConverter.swap(text)
                parts = text.split("\x1f")  # parts[0] holds the indicators
                subfieldData = [part for part in parts[1:] if part != ""]
                subfieldCodes = [part[0] for part in subfieldData]
                self.fields.append([tag, parts[0], subfieldData, subfieldCodes])
                # add the field index to the tag index
                self.tags.setdefault(tag, []).append(fieldIndex)

    def _makeConverter(self, charset):
        """Return a converter object for *charset*, or None if it is unknown.

        Only the requested converter is instantiated; the factories are
        wrapped in lambdas so the other converter classes are not touched.
        """
        factories = {
            "ansel": lambda: AnselToUTF8(),
            "ISO5426": lambda: convertToUTF8("ISO5426"),
            "ILS": lambda: ILSConvert(),
            "utf-8": lambda: NoConvert(),
            "latin1": lambda: LatinConvert(),
            "cp1252": lambda: cp1252Convert(),
            "cp1250": lambda: codecConvert(),
        }
        factory = factories.get(charset)
        if factory is None:
            return None
        return factory()

    # routines for displaying and outputting the record
    def showRecord(self):
        """Print the tag list, every field and the tag index."""
        print(self.taglist)
        for field in self.fields:
            print(field)
        print(self.tags)

    def listRecord(self):
        """Return a readable listing: one header line per field, then one
        ``--- code - value`` line per subfield."""
        lines = []
        for field in self.fields:
            lines.append("Tag %s / %s ( %s )\n" % (field[0], field[1], field[3]))
            lines.extend(["--- %s - %s\n" % (sub[0], sub[1:])
                          for sub in field[2] if sub != ""])
        return "".join(lines)

    def __str__(self):
        return "\n".join([str(field) for field in self.fields])

    def hasTag(self, tag):
        """Return the list of field indexes for *tag* ([] when absent)."""
        return self.tags.get(tag, [])

    def conditional(self, outputTest, trueString, falseString):
        """Return *trueString* when *outputTest* is true, else *falseString*."""
        if outputTest:
            return trueString
        return falseString

    # see if a subfield contains a string, and if so return true, otherwise false
    # this iterates across all repeats of tags and repeats of a subfield
    def contentMatch(self, matchField=None):
        """Report whether some subfield of a tag contains a given text.

        matchField -- ``[tag, subfield codes, text]``. Every occurrence of
                      *tag* and every subfield whose code is in *subfield
                      codes* is searched for *text*.

        Returns True on the first hit, False otherwise (including when
        *matchField* is empty or the tag is absent).
        """
        if not matchField:
            return False
        tag, codes, wanted = matchField[0], matchField[1], matchField[2]
        for fieldIndex in self.tags.get(tag, []):
            # self.tags holds indexes; the subfields live in self.fields
            for subfield in self.fields[fieldIndex][2]:
                if subfield[0] in codes and subfield[1:].find(wanted) > -1:
                    return True
        return False

    def setAttributes(self, data):
        """Copy the leader values of *data* onto the instance.

        Characters 5, 6, 7, 8, 17 and 18 become status, type, level,
        hierarchy, encoding and recform; characters 12-16 become the integer
        baseAddr, the offset of the first field.
        """
        self.status = data[5]
        self.type = data[6]
        self.level = data[7]
        self.hierarchy = data[8]
        self.encoding = data[17]
        self.recform = data[18]
        self.baseAddr = int(data[12:17])

    def do856(self):
        """Render every 856 field as an XML element.

        A field with a subfield 3 gives a ``dc:description`` containing that
        text and linking to subfield u; otherwise a field with a subfield u
        gives a ``dc:identifier`` containing the link. Returns "" when the
        record has no 856 field.
        """
        xml = []
        for fieldNumber in self.tags.get("856", []):
            data = ""
            link = ""
            for subfield in self.fields[fieldNumber][2]:
                if subfield[0] == "u":
                    link = subfield[1:]
                if subfield[0] == "3":
                    data = subfield[1:]
            if data:
                xml.append("<%s %s>%s</%s>\n" % (
                    "dc:description", ' xlink:href="' + link + '"',
                    data, "dc:description"))
            elif link:
                xml.append("<%s %s>%s</%s>\n" % (
                    "dc:identifier",
                    ' xsi:type="tel:URI" xlink:href="' + link + '"',
                    link, "dc:identifier"))
        return "".join(xml)

    def rangeAsXML(self, start, end, butNot=(), element="", attributes="",
                   subfields="abcdefghijklmnopqrstuvwxyz", alternative=""):
        """Output every field whose tag lies in ``start``..``end`` inclusive.

        butNot      -- tags inside the range to skip.
        subfields   -- codes of the subfields to include; each selected
                       subfield is followed by ". ".
        alternative -- returned when nothing was produced.

        Tags are processed in sorted order so the output is deterministic. A
        field whose content equals the previously emitted one is skipped.
        """
        xml = []
        oldstuff = ""
        for tag in sorted(self.tags):
            if tag < start or tag > end or tag in butNot:
                continue
            for fieldIndex in self.tags[tag]:
                # select only the required subfields
                stuff = "".join([subfield[1:] + ". "
                                 for subfield in self.fields[fieldIndex][2]
                                 if subfield[0] in subfields])
                if stuff and stuff != oldstuff:
                    xml.append("<%s %s>%s</%s>\n"
                               % (element, attributes, stuff, element))
                    oldstuff = stuff
        return "".join(xml) or alternative

    def mergeTagsAsXML(self, tagList=(), element="", attributes="",
                       alternative=""):
        """Merge the content of several tags into a single element.

        Only the first occurrence of each tag is used. Each tagList entry is
        ``[tagType, tag, subfields, punc, posLen, wrapText]``:

        "T" -- the subfields whose code is in *subfields* are joined with
               *punc*, e.g. ``["T", "256", "a", "", [], "%s,"]``.
        "F" -- the first subfield (including its leading code character) is
               sliced ``[posLen[0]:posLen[1]]``, i.e. posLen is used as a
               start/end pair, e.g. ``["F", "100", "", "", [9, 14], "(%s)"]``.

        The result of either is inserted into *wrapText*. Returns
        *alternative* when nothing was produced.
        """
        pieces = []
        for tagType, tag, subfields, punc, posLen, wrapText in tagList:
            if tag not in self.tags:
                continue
            content = self.fields[self.tags[tag][0]][2]
            if tagType == "T":
                pieces.append(
                    wrapText % self._joinSubfields(content, subfields, punc))
            elif tagType == "F":
                pieces.append(wrapText % content[0][posLen[0]:posLen[1]])
        xml = "".join(pieces)
        if xml:
            return "<%s %s>%s</%s>\n" % (element, attributes, xml, element)
        return alternative

    def _occurrences(self, tag, output):
        """Return the field indexes of *tag* selected by *output*: "first"
        gives the first occurrence, "repeats" the second and later ones, and
        anything else all of them."""
        indexes = self.tags.get(tag, [])
        if output == "first":
            return indexes[:1]
        if output == "repeats":
            return indexes[1:]
        return indexes

    def _joinSubfields(self, fieldContent, subfields, punc):
        """Join with *punc* the values of the subfields whose code is in
        *subfields*."""
        return punc.join([subfield[1:] for subfield in fieldContent
                          if subfield[0] in subfields])

    def tagAsXML(self, tags, subfields="abcdefghijklmnopqrstuvwxyz", element="",
                 attributes="", alternative="", output="all", indicator1=False,
                 indicator2=False, codeToText=None, punc=" ", wrapText="%s",
                 linker="", replace=()):
        """Output the selected subfields of each tag as one element per field.

        tags        -- list of tags to output, in this order.
        subfields   -- codes of the subfields to include.
        output      -- "all", "first" or "repeats" (second and subsequent
                       occurrences of a tag).
        punc        -- separator placed between subfield values.
        wrapText    -- ``%s`` template applied to the joined values ("" is
                       treated as "%s").
        linker      -- when not "", a ``%s`` template that is filled with the
                       joined values and added as an ``xlink:href`` attribute.
        replace     -- sequence of ``(pattern, replacement)`` pairs applied
                       with re.sub to the wrapped text.
        alternative -- returned when nothing was produced.
        indicator1, indicator2, codeToText -- accepted but currently unused.

        A field whose text equals the previously emitted one is skipped.
        """
        if linker != "":
            linker = 'xlink:href="' + linker + '"'
        if wrapText == "":
            wrapText = "%s"
        xml = []
        oldstuff = ""
        for tag in tags:
            for fieldIndex in self._occurrences(tag, output):
                joined = self._joinSubfields(
                    self.fields[fieldIndex][2], subfields, punc)
                stuff = wrapText % joined
                for rule in replace:
                    stuff = re.sub(rule[0], rule[1], stuff)
                if stuff and stuff != oldstuff:
                    newAttributes = " "     # a space before a new attribute
                    if linker:
                        newAttributes += linker % joined
                    xml.append("<%s %s%s>%s</%s>\n" % (
                        element, attributes, newAttributes, stuff, element))
                    oldstuff = stuff
        return "".join(xml) or alternative

    def tagAsText(self, tags, subfields="abcdefghijklmnopqrstuvwxyz",
                  alternative="", output="all", indicator1=False,
                  indicator2=False, codeToText=None, punc=" ", wrapText="%s"):
        """Like tagAsXML() but return the wrapped text of the selected fields
        concatenated, without any element markup.

        indicator1, indicator2 and codeToText are accepted but currently
        unused. Returns *alternative* when nothing was produced.
        """
        pieces = []
        oldstuff = ""
        for tag in tags:
            for fieldIndex in self._occurrences(tag, output):
                stuff = wrapText % self._joinSubfields(
                    self.fields[fieldIndex][2], subfields, punc)
                if stuff and stuff != oldstuff:
                    pieces.append(stuff)
                    oldstuff = stuff
        return "".join(pieces) or alternative

    def _fixedValues(self, tag, position, length, codeToText):
        """Return the fixed-position values of the first occurrence of *tag*.

        For each start in *position* the first subfield (including its leading
        code character) is sliced ``[pos:pos+length]``; positions that would
        run past the end are skipped. With *codeToText* the value is
        translated, and a value translated to "" is dropped.

        NOTE(review): the layout of the original source did not show which
        ``if`` the ``else`` branch belonged to; it is read here as the
        alternative to ``if codeToText`` -- confirm against an intact copy.
        """
        values = []
        for pos in position:
            subfield = self.fields[self.tags[tag][0]][2][0]
            if pos + length > len(subfield):
                continue
            text = subfield[pos:pos + length]
            if codeToText:
                text = codeToText.get(text, text)
                if not text:
                    continue
            values.append(text)
        return values

    def fixedTagAsXML(self, tag, position=(0,), length=1, element="",
                      attributes="", alternative="", codeToText=None,
                      unknown=""):
        """Output fixed-position data of *tag* as one element per position.

        position    -- sequence of start offsets into the first subfield.
        length      -- number of characters taken at each offset.
        codeToText  -- optional dict translating each extracted code.
        alternative -- returned when the tag is absent.
        unknown     -- accepted but currently unused.

        Returns "" when the tag is present but no position produced output.
        """
        if tag not in self.tags:
            return alternative
        return "".join(["<%s %s>%s</%s>\n" % (element, attributes, text, element)
                        for text in self._fixedValues(
                            tag, position, length, codeToText)])

    def fixedTagAsText(self, tag, position=(0,), length=1, alternative="",
                       codeToText=None, unknown="", wrapText="%s"):
        """Like fixedTagAsXML() but insert each value into *wrapText* and
        return the concatenation, without element markup."""
        if tag not in self.tags:
            return alternative
        return "".join([wrapText % text
                        for text in self._fixedValues(
                            tag, position, length, codeToText)])

    def variableTagXML(self, variableTags=()):
        """Run tagAsXML() for each ``[tags, subfields, wrapText, element,
        attributes]`` entry and return the concatenated output ("" when
        *variableTags* is None)."""
        if variableTags is None:
            return ""
        return "".join([self.tagAsXML(tags=item[0], subfields=item[1],
                                      element=item[3], attributes=item[4],
                                      wrapText=item[2])
                        for item in variableTags])


# routines for processing the record

class NoConvert:
    """Converter for data that needs no character set conversion."""

    def swap(self, c):
        """Return *c* with ``&`` and ``<`` XML-escaped and the control
        characters 0x1C, 0x1D and 0x1E removed."""
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        for control in ("\x1C", "\x1D", "\x1E"):
            c = c.replace(control, "")
        return c


class LatinConvert:
    """Converter from latin-1 to XML-escaped UTF-8 (Python 2 byte strings)."""

    def swap(self, c):
        """Return the latin-1 string *c* re-encoded as UTF-8 with ``&`` and
        ``<`` XML-escaped."""
        c = unicode(c, "latin-1").encode("utf-8")
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        return c


class cp1252Convert:
    """Converter from cp1252 to XML-escaped UTF-8 (Python 2 byte strings)."""

    def swap(self, c):
        """Return the cp1252 string *c* re-encoded as UTF-8 with escape
        characters (0x1B) removed and ``&`` and ``<`` XML-escaped."""
        c = c.replace("\x1B", "")
        c = unicode(c, "cp1252").encode("utf-8")
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        return c

class codecConvert:
    """Converter from cp1250 to XML-escaped UTF-8 (Python 2 byte strings)."""

    def swap(self, c):
        """Return the cp1250 string *c* re-encoded as UTF-8 with the control
        characters 0x1C, 0x1D and 0x1E removed and ``&`` and ``<``
        XML-escaped."""
        for control in ("\x1C", "\x1D", "\x1E"):
            c = c.replace(control, "")
        c = unicode(c, "cp1250").encode("utf-8")
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        return c


class ILSConvert:
    """Converter for ISO-8859-1 data in which other characters are written as
    a backslash followed by four upper-case hex digits (Python 2 only)."""

    def swap(self, c):
        """Return *c* as XML-escaped UTF-8 with every ``\\XXXX`` sequence
        replaced by the character it names (see toUni())."""
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        u = unicode(c, "ISO-8859-1")
        u = re.sub("\\\\[0-9A-F]{4}", self.toUni, u)
        return u.encode("utf-8")

    def toUni(self, match):
        """Return the replacement for one ``\\XXXX`` regex match.

        U+FE20 is dropped and U+FE21 is replaced by U+0361; every other code
        point is returned as is.
        """
        a = unichr(int(match.group()[1:], base=16))
        if a == u"\uFE20":
            return ""
        if a == u"\uFE21":
            a = u"\u0361"
        return a


class AnselToUTF8:
    """Converter from ANSEL to XML-escaped UTF-8.

    ``anseldict`` maps each ANSEL character to a two-item list: the Unicode
    code point written out for reference, and the UTF-8 byte sequence that
    swap() emits.
    """

    def __init__(self):
        self.anseldict = {
            "\xA0" : [ "\u0020" , "\x20" ],  # HARD SPACE - represented by a space
            "\xA1" : [ "\u0141" , "\xC5\x81" ],  # UPPERCASE POLISH L / LATIN CAPITAL LETTER L WITH STROKE
            "\xA2" : [ "\u00D8" , "\xC3\x98" ],  # UPPERCASE SCANDINAVIAN O / LATIN CAPITAL LETTER O WITH STROKE
            "\xA3" : [ "\u0110" , "\xC4\x90" ],  # UPPERCASE D WITH CROSSBAR / LATIN CAPITAL LETTER D WITH STROKE
            "\xA4" : [ "\u00DE" , "\xC3\x9E" ],  # UPPERCASE ICELANDIC THORN / LATIN CAPITAL LETTER THORN (Icelandic)
            "\xA5" : [ "\u00C6" , "\xC3\x86" ],  # UPPERCASE DIGRAPH AE / LATIN CAPITAL LIGATURE AE
            "\xA6" : [ "\u0152" , "\xC5\x92" ],  # UPPERCASE DIGRAPH OE / LATIN CAPITAL LIGATURE OE
            "\xA7" : [ "\u02B9" , "\xCA\xB9" ],  # SOFT SIGN, PRIME / MODIFIER LETTER PRIME
            "\xA8" : [ "\u00B7" , "\xC2\xB7" ],  # MIDDLE DOT
            "\xA9" : [ "\u266D" , "\xE2\x99\xAD" ],  # MUSIC FLAT SIGN
            "\xAA" : [ "\u00AE" , "\xC2\xAE" ],  # PATENT MARK / REGISTERED SIGN
            "\xAB" : [ "\u00B1" , "\xC2\xB1" ],  # PLUS OR MINUS / PLUS-MINUS SIGN
            "\xAC" : [ "\u01A0" , "\xC6\xA0" ],  # UPPERCASE O-HOOK / LATIN CAPITAL LETTER O WITH HORN
            "\xAD" : [ "\u01AF" , "\xC6\xAF" ],  # UPPERCASE U-HOOK / LATIN CAPITAL LETTER U WITH HORN
            "\xAE" : [ "\u02BE" , "\xCA\xBE" ],  # ALIF / MODIFIER LETTER RIGHT HALF RING
            "\xB0" : [ "\u02BB" , "\xCA\xBB" ],  # AYN / MODIFIER LETTER TURNED COMMA
            "\xB1" : [ "\u0142" , "\xC5\x82" ],  # LOWERCASE POLISH L / LATIN SMALL LETTER L WITH STROKE
            "\xB2" : [ "\u00F8" , "\xC3\xB8" ],  # LOWERCASE SCANDINAVIAN O / LATIN SMALL LETTER O WITH STROKE
            "\xB3" : [ "\u0111" , "\xC4\x91" ],  # LOWERCASE D WITH CROSSBAR / LATIN SMALL LETTER D WITH STROKE
            "\xB4" : [ "\u00FE" , "\xC3\xBE" ],  # LOWERCASE ICELANDIC THORN / LATIN SMALL LETTER THORN (Icelandic)
            "\xB5" : [ "\u00E6" , "\xC3\xA6" ],  # LOWERCASE DIGRAPH AE / LATIN SMALL LIGATURE AE
            "\xB6" : [ "\u0153" , "\xC5\x93" ],  # LOWERCASE DIGRAPH OE / LATIN SMALL LIGATURE OE
            "\xB7" : [ "\u02BA" , "\xCA\xBA" ],  # HARD SIGN, DOUBLE PRIME / MODIFIER LETTER DOUBLE PRIME
            "\xB8" : [ "\u0131" , "\xC4\xB1" ],  # LOWERCASE TURKISH I / LATIN SMALL LETTER DOTLESS I
            "\xB9" : [ "\u00A3" , "\xC2\xA3" ],  # BRITISH POUND / POUND SIGN
            "\xBA" : [ "\u00F0" , "\xC3\xB0" ],  # LOWERCASE ETH / LATIN SMALL LETTER ETH (Icelandic)
            "\xBC" : [ "\u01A1" , "\xC6\xA1" ],  # LOWERCASE O-HOOK / LATIN SMALL LETTER O WITH HORN
            "\xBD" : [ "\u01B0" , "\xC6\xB0" ],  # LOWERCASE U-HOOK / LATIN SMALL LETTER U WITH HORN
            "\xC0" : [ "\u00B0" , "\xC2\xB0" ],  # DEGREE SIGN
            "\xC1" : [ "\u2113" , "\xE2\x84\x93" ],  # SCRIPT SMALL L
            "\xC2" : [ "\u2117" , "\xE2\x84\x97" ],  # SOUND RECORDING COPYRIGHT
            "\xC3" : [ "\u00A9" , "\xC2\xA9" ],  # COPYRIGHT SIGN
            "\xC4" : [ "\u266F" , "\xE2\x99\xAF" ],  # MUSIC SHARP SIGN
            "\xC5" : [ "\u00BF" , "\xC2\xBF" ],  # INVERTED QUESTION MARK
            "\xC6" : [ "\u00A1" , "\xC2\xA1" ],  # INVERTED EXCLAMATION MARK
            "\xE0" : [ "\u0309" , "\xCC\x89" ],  # PSEUDO QUESTION MARK / COMBINING HOOK ABOVE
            "\xE1" : [ "\u0300" , "\xCC\x80" ],  # GRAVE / COMBINING GRAVE ACCENT (Varia)
            "\xE2" : [ "\u0301" , "\xCC\x81" ],  # ACUTE / COMBINING ACUTE ACCENT (Oxia)
            "\xE3" : [ "\u0302" , "\xCC\x82" ],  # CIRCUMFLEX / COMBINING CIRCUMFLEX ACCENT
            "\xE4" : [ "\u0303" , "\xCC\x83" ],  # TILDE / COMBINING TILDE
            "\xE5" : [ "\u0304" , "\xCC\x84" ],  # MACRON / COMBINING MACRON
            "\xE6" : [ "\u0306" , "\xCC\x86" ],  # BREVE / COMBINING BREVE (Vrachy)
            "\xE7" : [ "\u0307" , "\xCC\x87" ],  # SUPERIOR DOT / COMBINING DOT ABOVE
            "\xE8" : [ "\u0308" , "\xCC\x88" ],  # UMLAUT, DIAERESIS / COMBINING DIAERESIS (Dialytika)
            "\xE9" : [ "\u030C" , "\xCC\x8C" ],  # HACEK / COMBINING CARON
            "\xEA" : [ "\u030A" , "\xCC\x8A" ],  # CIRCLE ABOVE, ANGSTROM / COMBINING RING ABOVE
            "\xEB" : [ "\uFE20" , "\xEF\xB8\xA0" ],  # LIGATURE, FIRST HALF / COMBINING LIGATURE LEFT HALF
            "\xEC" : [ "\uFE21" , "\xEF\xB8\xA1" ],  # LIGATURE, SECOND HALF / COMBINING LIGATURE RIGHT HALF
            "\xED" : [ "\u0315" , "\xCC\x95" ],  # HIGH COMMA, OFF CENTER / COMBINING COMMA ABOVE RIGHT
            "\xEE" : [ "\u030B" , "\xCC\x8B" ],  # DOUBLE ACUTE / COMBINING DOUBLE ACUTE ACCENT
            "\xEF" : [ "\u0310" , "\xCC\x90" ],  # CANDRABINDU / COMBINING CANDRABINDU
            "\xF0" : [ "\u0327" , "\xCC\xA7" ],  # CEDILLA / COMBINING CEDILLA
            "\xF1" : [ "\u0328" , "\xCC\xA8" ],  # RIGHT HOOK, OGONEK / COMBINING OGONEK
            "\xF2" : [ "\u0323" , "\xCC\xA3" ],  # DOT BELOW / COMBINING DOT BELOW
            "\xF3" : [ "\u0324" , "\xCC\xA4" ],  # DOUBLE DOT BELOW / COMBINING DIAERESIS BELOW
            "\xF4" : [ "\u0325" , "\xCC\xA5" ],  # CIRCLE BELOW / COMBINING RING BELOW
            "\xF5" : [ "\u0333" , "\xCC\xB3" ],  # DOUBLE UNDERSCORE / COMBINING DOUBLE LOW LINE
            "\xF6" : [ "\u0332" , "\xCC\xB2" ],  # UNDERSCORE / COMBINING LOW LINE
            "\xF7" : [ "\u0326" , "\xCC\xA6" ],  # LEFT HOOK (COMMA BELOW) / COMBINING COMMA BELOW
            "\xF8" : [ "\u031C" , "\xCC\x9C" ],  # RIGHT CEDILLA / COMBINING LEFT HALF RING BELOW
            "\xF9" : [ "\u032E" , "\xCC\xAE" ],  # UPADHMANIYA / COMBINING BREVE BELOW
            "\xFA" : [ "\uFE22" , "\xEF\xB8\xA2" ],  # DOUBLE TILDE, FIRST HALF / COMBINING DOUBLE TILDE LEFT HALF
            "\xFB" : [ "\uFE23" , "\xEF\xB8\xA3" ],  # DOUBLE TILDE, SECOND HALF / COMBINING DOUBLE TILDE RIGHT HALF
            "\xFE" : [ "\u0313" , "\xCC\x93" ],  # HIGH COMMA, CENTERED / COMBINING COMMA ABOVE (Psili)
        }

    def swap(self, c):
        """Return the ANSEL string *c* as XML-escaped UTF-8.

        Escape characters (0x1B) are removed and ``&`` / ``<`` are escaped.
        The combining marks (0xE0 and above in the table) come before their
        base character in the input, so they are held back and written after
        the next base character. The other table entries (0xA0-0xC6) are
        characters in their own right and are written where they occur. A
        character above 0x9F that is not in the table becomes "?". Marks
        still pending at the end of the input are appended rather than lost.
        """
        c = c.replace("\x1B", "")
        c = c.replace("&", "&amp;")
        c = c.replace("<", "&lt;")
        out = []
        pending = ""    # UTF-8 of combining marks waiting for their base
        for ch in c:
            code = ord(ch)
            if code <= 159:
                out.append(ch + pending)
                pending = ""
            elif ch not in self.anseldict:
                out.append("?")
            elif code >= 0xE0:
                pending += self.anseldict[ch][1]
            else:
                out.append(self.anseldict[ch][1] + pending)
                pending = ""
        out.append(pending)
        return "".join(out)


class convertToUTF8:
    '''
    convertToUTF8 provides a conversion from several character sets used in
    Marc records to utf-8 (Python 2 byte strings).

    Usage :
        converter = convertToUTF8("setname")
        string = converter.swap(string)
    '''

    # For each character set: [ characters, diacritics ]. The diacritics come
    # before their base character in the data and are output after it.
    set = {
        # ISO5426 character set
        "ISO5426" : [
            {
                "\xA1" : u"\u00A1" , "\xA2" : u"\u201E" , "\xA3" : u"\u00A3" ,
                "\xA4" : u"\u0024" , "\xA5" : u"\u00A5" , "\xA6" : u"\u2020" ,
                "\xA7" : u"\u00A7" , "\xA8" : u"\u02B9" , "\xA9" : u"\u2018" ,
                "\xAA" : u"\u201C" , "\xAB" : u"\u00AB" , "\xAC" : u"\u266D" ,
                "\xAD" : u"\u00A9" , "\xAE" : u"\u2117" , "\xAF" : u"\u00AE" ,
                "\xB0" : u"\u02BF" , "\xB1" : u"\u02BE" , "\xB2" : u"\u201A" ,
                "\xB6" : u"\u2021" , "\xB7" : u"\u00B7" , "\xB8" : u"\u02BA" ,
                "\xB9" : u"\u2019" , "\xBA" : u"\u201D" , "\xBB" : u"\u00BB" ,
                "\xBC" : u"\u266F" , "\xBD" : u"\u02B9" , "\xBE" : u"\u02BA" ,
                "\xBF" : u"\u00BF" ,
                "\xE1" : u"\u00C6" , "\xE2" : u"\u0110" , "\xE6" : u"\u0132" ,
                "\xE8" : u"\u0141" , "\xE9" : u"\u00D8" , "\xEA" : u"\u0152" ,
                "\xEC" : u"\u00DE" , "\xF1" : u"\u00E6" , "\xF2" : u"\u0111" ,
                "\xF3" : u"\u00F0" , "\xF5" : u"\u0131" , "\xF6" : u"\u0133" ,
                "\xF8" : u"\u0142" , "\xF9" : u"\u00F8" , "\xFA" : u"\u0153" ,
                "\xFB" : u"\u00DF" , "\xFC" : u"\u00FE" ,
            },
            # diacritics that appear after
            {
                "\xC0" : u"\u0309" , "\xC1" : u"\u0300" , "\xC2" : u"\u0301" ,
                "\xC3" : u"\u0302" , "\xC4" : u"\u0303" , "\xC5" : u"\u0304" ,
                "\xC6" : u"\u0306" , "\xC7" : u"\u0307" , "\xC8" : u"\u0308" ,
                "\xC9" : u"\u0308" , "\xCA" : u"\u030A" , "\xCB" : u"\u0315" ,
                "\xCC" : u"\u0312" , "\xCD" : u"\u030B" , "\xCE" : u"\u031B" ,
                "\xCF" : u"\u030C" , "\xD0" : u"\u0327" , "\xD1" : u"\u0328" ,
                "\xD2" : u"\u0321" , "\xD3" : u"\u0322" , "\xD4" : u"\u0325" ,
                "\xD5" : u"\u032E" , "\xD6" : u"\u0323" , "\xD7" : u"\u0324" ,
                "\xD8" : u"\u0332" , "\xD9" : u"\u0333" , "\xDA" : u"\u0329" ,
                "\xDB" : u"\u032D" , "\xDD" : u"\uFE20" , "\xDE" : u"\uFE21" ,
                # the key must be the single character; a stray leading space
                # would stop it ever matching
                "\xDF" : u"\uFE23" ,
            },
        ],
    }

    def __init__(self, sourceset):
        """Select the tables for *sourceset*, a key of ``convertToUTF8.set``
        (KeyError when unknown)."""
        self.sourceset = sourceset
        self.character = convertToUTF8.set[sourceset][0]
        self.diacriticAfter = convertToUTF8.set[sourceset][1]

    def swap(self, s):
        """Return the string *s* as XML-escaped UTF-8.

        Characters above 0x7F are looked up in the tables; an unmapped one is
        dropped. Diacritics are held back and written after the next
        character; any still pending at the end are appended.
        """
        o = ""
        after = ""
        for ch in s:
            if ord(ch) > 127:
                if ch in self.diacriticAfter:
                    after += self.diacriticAfter[ch].encode("utf-8")
                    continue
                o += self.character.get(ch, "").encode("utf-8") + after
            else:
                o += ch + after
            after = ""
        o += after
        return o.replace("&", "&amp;").replace("<", "&lt;")


if __name__ == "__main__":
    # A UNIMARC test record. The spacing matters: the leader is 24 characters
    # and the directory gives field 100 a length of 41, i.e. a 36-character
    # subfield a.
    data = (
        "00453nam  22001451n 450 "
        "001002100000100004100021101000800062200003200070210001700102"
        "215001000119702002700129801003000156917003500186930008600221\x1e"
        "FRBNF300200490000001\x1e"
        "||\x1fa19970701d1890       u0frey0103    ba\x1e"
        "||\x1faeng\x1e"
        "||\x1faYorkshire in olden times...\x1e"
        "||\x1faLondon\x1fd1890\x1e"
        "||\x1faIn-16\x1e"
        "||\x1faAndrews\x1fbWilliam\x1f40360\x1e"
        "| 3\x1faFR\x1fbBnF\x1fc20040906\x1fgAFNOR\x1e"
        "||\x1f5759999999:30020049001001\x1faubbc\x1e"
        "||\x1f5759999999:30020049001001\x1faNP-141\x1fb759999999"
        "\x1fcTolbiac - Rez de jardin - Magasin\x1fdO\x1e"
        "\x1d"
    )
    test = ISO2709(data, charset="ISO5426")
    test = ISO2709(data, charset="ansel")
    test = ISO2709(data, charset="")
    test.showRecord()
    print("# A tag 200")
    print(test.tagAsXML(["200"], element="title", attributes='xsi:lang="eng"'))
    print("# A single subfield - c")
    print(test.tagAsXML(["930"], subfields="c", element="location"))
    print("# Group of tags")
    print(test.tagAsXML(["200", "215", "702"], element="title",
                        attributes='xsi:lang="eng"'))
    print("# Try to print tag not present")
    print(test.tagAsXML(["201"], element="title", attributes='xsi:lang="eng"',
                        alternative='No Title'))
    print("# Range 220/299 except 210")
    print(test.rangeAsXML("200", "299", butNot=["210"], element="title",
                          attributes='xsi:lang="eng"', alternative='No Title'))
    print("# Fixed tag")
    print(test.fixedTagAsXML("100", position=[9], length=5, element="date",
                             alternative="Error"))
    print("# Merged tags")
    print(test.mergeTagsAsXML(
        tagList=[["T", "200", "a", "", [], "Title %s,"],
                 ["F", "100", "", "", [9, 14], "(%s)"]],
        element="dcterms.bibliographicCitation", attributes="",
        alternative="Fred"))

    print("SUTRS test ++++++++++++++++++++++++++++++++++")
    SUTRSdata = (
        "Author(s):\t\tWilson, C. C.\n"
        "Article Title:\t\tHydrogen atoms in acetylsalicylic acid (Aspirin): "
        "the librating methyl group and probing the potential well in the "
        "hydrogen-bonded dimer\n"
        "Journal Title:\t\tCHEMICAL PHYSICS LETTERS\n"
        "Publisher:\tELSEVIER SCIENCE\n"
        "Year:\t\t2001\n"
        "Pages:\t\t57-63\n"
        "ISSN:\t\t0009-2614\n"
        "Volume/Issue:\t\tVOL 335; NUMBER 1-2\n"
        "Abstract:\t\tThe structure of acetylsalicylic acid "
        "(2-(acetoyloxy)benzoic acid; Aspirin) has been studied by variable "
        "temperature single crystal neutron diffraction. The usual large "
        "torsional librational motion of the terminal methyl group is "
        "observed and its temperature dependence analysed using a simple "
        "model for the potential, yielding the force constant and barrier "
        "height for this motion. In addition, asymmetry of the scattering "
        "density of the proton involved in the hydrogen bond forming the "
        "carboxylic acid dimer motif is observed at temperatures above 200 K. "
        "This asymmetry is discussed in terms of its possible implications "
        "for the shape of the hydrogen bonding potential well.\n"
        "Language:\t\tEnglish\n"
        "Shelfmark:\t\t3148.830000\n"
        "Dewey Class No:\t530 540\n"
        "LC Class:\tQC QD\n"
        "Copyright Fee:\tGBP18.00\n"
        "Date Of Addition:\t041028\n"
        "Frequency:\tWeekly\n"
        "Conference Sponsors:\t1\n"
        "Loan Embargo Date:\t100103\n"
        "ID Number:\t\tRN000046980\n"
        "Date Of Modification:\t20041028\n"
    )
    test = SUTRS(SUTRSdata)
    test.showRecord()
    print("* Single label ")
    print(test.labelAsXML([["Article Title", 0, 0, "%s"]], element="dc:title"))
    print("* Part label ")
    print(test.labelAsXML([["ID Number", 2, 0, "** %s **"]],
                          element="dc:identifier",
                          attributes='xsi:type="tel:request"'))
    print("* Mutiple label ")
    print(test.labelAsXML([["Journal Title", 0, 0, "%s -"],
                           ["Volume/Issue", 0, 0, " %s"],
                           ["Pages", 0, 0, ", %s"],
                           ["Year", 0, 0, " (%s)"]],
                          element="dcterms:bibliographicCitation"))
    print("+ Present test ")
    print(test.present("Title", TrueData="Title is present",
                       FalseData="Title not present"))
    print(test.present("Article Title", TrueData="Article Title is present",
                       FalseData="Not present"))