""" MARC utlities Public Domain 2007 public.resource.org Author: Joel Hardi """ class locToUTF8(object): "Changes text from LOC into unicode, using replace() method" dict = {} charmap = {} def __init__(self): "Sets self.dict and search character index self.charmap" self.dict = { "\x20":"\u0020", # "HARD SPACE - represented by a space" "\xC2\xA1":"\u00A1", # "INVERTED EXCLAMATION MARK" "\xC2\xA3":"\u00A3", # "BRITISH POUND / POUND SIGN" "\xC2\xA9":"\u00A9", # "COPYRIGHT SIGN" "\xC2\xAE":"\u00AE", # "PATENT MARK / REGISTERED SIGN" "\xC2\xB0":"\u00B0", # "DEGREE SIGN" "\xC2\xB1":"\u00B1", # "PLUS OR MINUS / PLUS-MINUS SIGN" "\xC2\xB7":"\u00B7", # "MIDDLE DOT" "\xC2\xBF":"\u00BF", # "INVERTED QUESTION MARK" "\xC3\x86":"\u00C6", # "UPPERCASE DIGRAPH AE / LATIN CAPITAL LIGATURE AE" "\xC3\x98":"\u00D8", # "UPPERCASE SCANDINAVIAN O / LATIN CAPITAL LETTER O WITH STROKE" "\xC3\x9E":"\u00DE", # "UPPERCASE ICELANDIC THORN / LATIN CAPITAL LETTER THORN (Icelandic)" "\xC3\xA6":"\u00E6", # "LOWERCASE DIGRAPH AE / LATIN SMALL LIGATURE AE" "\xC3\xB0":"\u00F0", # "LOWERCASE ETH / LATIN SMALL LETTER ETH (Icelandic)" "\xC3\xB8":"\u00F8", # "LOWERCASE SCANDINAVIAN O / LATIN SMALL LETTER O WITH STROKE" "\xC3\xBE":"\u00FE", # "LOWERCASE ICELANDIC THORN / LATIN SMALL LETTER THORN (Icelandic)" "\xC4\x90":"\u0110", # "UPPERCASE D WITH CROSSBAR / LATIN CAPITAL LETTER D WITH STROKE" "\xC4\x91":"\u0111", # "LOWERCASE D WITH CROSSBAR / LATIN SMALL LETTER D WITH STROKE" "\xC4\xB1":"\u0131", # "LOWERCASE TURKISH I / LATIN SMALL LETTER DOTLESS I" "\xC5\x81":"\u0141", # "UPPERCASE POLISH L / LATIN CAPITAL LETTER L WITH STROKE" "\xC5\x82":"\u0142", # "LOWERCASE POLISH L / LATIN SMALL LETTER L WITH STROKE" "\xC5\x92":"\u0152", # "UPPERCASE DIGRAPH OE / LATIN CAPITAL LIGATURE OE" "\xC5\x93":"\u0153", # "LOWERCASE DIGRAPH OE / LATIN SMALL LIGATURE OE" "\xC6\xA0":"\u01A0", # "UPPERCASE O-HOOK / LATIN CAPITAL LETTER O WITH HORN" "\xC6\xA1":"\u01A1", # "LOWERCASE O-HOOK / LATIN SMALL LETTER O WITH HORN" "\xC6\xAF":"\u01AF", # "UPPERCASE U-HOOK / LATIN CAPITAL LETTER U WITH HORN" "\xC6\xB0":"\u01B0", # "LOWERCASE U-HOOK / LATIN SMALL LETTER U WITH HORN" "\xCA\xB9":"\u02B9", # "SOFT SIGN, PRIME / MODIFIER LETTER PRIME" "\xCA\xBA":"\u02BA", # "HARD SIGN, DOUBLE PRIME / MODIFIER LETTER DOUBLE PRIME" "\xCA\xBB":"\u02BB", # "AYN / MODIFIER LETTER TURNED COMMA" "\xCA\xBE":"\u02BE", # "ALIF / MODIFIER LETTER RIGHT HALF RING" "\xCC\x80":"\u0300", # "GRAVE / COMBINING GRAVE ACCENT (Varia)" "\xCC\x81":"\u0301", # "ACUTE / COMBINING ACUTE ACCENT (Oxia)" "\xCC\x82":"\u0302", # "CIRCUMFLEX / COMBINING CIRCUMFLEX ACCENT" "\xCC\x83":"\u0303", # "TILDE / COMBINING TILDE" "\xCC\x84":"\u0304", # "MACRON / COMBINING MACRON" "\xCC\x86":"\u0306", # "BREVE / COMBINING BREVE (Vrachy)" "\xCC\x87":"\u0307", # "SUPERIOR DOT / COMBINING DOT ABOVE" "\xCC\x88":"\u0308", # "UMLAUT, DIAERESIS / COMBINING DIAERESIS (Dialytika)" "\xCC\x89":"\u0309", # "PSEUDO QUESTION MARK / COMBINING HOOK ABOVE" "\xCC\x8A":"\u030A", # "CIRCLE ABOVE, ANGSTROM / COMBINING RING ABOVE" "\xCC\x8B":"\u030B", # "DOUBLE ACUTE / COMBINING DOUBLE ACUTE ACCENT" "\xCC\x8C":"\u030C", # "HACEK / COMBINING CARON" "\xCC\x90":"\u0310", # "CANDRABINDU / COMBINING CANDRABINDU" "\xCC\x93":"\u0313", # "HIGH COMMA, CENTERED / COMBINING COMMA ABOVE (Psili)" "\xCC\x95":"\u0315", # "HIGH COMMA, OFF CENTER / COMBINING COMMA ABOVE RIGHT" "\xCC\x9C":"\u031C", # "RIGHT CEDILLA / COMBINING LEFT HALF RING BELOW" "\xCC\xA3":"\u0323", # "DOT BELOW / COMBINING DOT BELOW" "\xCC\xA4":"\u0324", # "DOUBLE DOT BELOW / COMBINING DIAERESIS BELOW" "\xCC\xA5":"\u0325", # "CIRCLE BELOW / COMBINING RING BELOW" "\xCC\xA6":"\u0326", # "LEFT HOOK (COMMA BELOW) / COMBINING COMMA BELOW" "\xCC\xA7":"\u0327", # "CEDILLA / COMBINING CEDILLA" "\xCC\xA8":"\u0328", # "RIGHT HOOK, OGONEK / COMBINING OGONEK" "\xCC\xAE":"\u032E", # "UPADHMANIYA / COMBINING BREVE BELOW" "\xCC\xB2":"\u0332", # "UNDERSCORE / COMBINING LOW LINE" "\xCC\xB3":"\u0333", # "DOUBLE UNDERSCORE / COMBINING DOUBLE LOW LINE" "\xE2\x84\x93":"\u2113", # "SCRIPT SMALL L" "\xE2\x84\x97":"\u2117", # "SOUND RECORDING COPYRIGHT" "\xE2\x99\xAD":"\u266D", # "MUSIC FLAT SIGN" "\xE2\x99\xAF":"\u266F", # "MUSIC SHARP SIGN" "\xEF\xB8\xA0":"\uFE20", # "LIGATURE, FIRST HALF / COMBINING LIGATURE LEFT HALF" "\xEF\xB8\xA1":"\uFE21", # "LIGATURE, SECOND HALF / COMBINING LIGATURE RIGHT HALF" "\xEF\xB8\xA2":"\uFE22", # "DOUBLE TILDE, FIRST HALF / COMBINING DOUBLE TILDE LEFT HALF" "\xEF\xB8\xA3":"\uFE23", # "DOUBLE TILDE, SECOND HALF / COMBINING DOUBLE TILDE RIGHT HALF" } # build self.charmap to map each first char of a search string to a list of its search strings firstchars = [] self.charmap = {} for i in self.dict.iterkeys(): if firstchars.count(i[0]) == 0: firstchars.append(i[0]) self.charmap[i[0]] = [] self.charmap[i[0]].append(i) def replace(self, str): "Given string str, returns unicode string with correct character replcements" searchchars = [] # build subset of search/replace pairs to use based on if first char of search appears in str prev = range(0,3) for c in str: prev[0] = prev[1] prev[1] = prev[2] prev[2] = c if self.charmap.has_key(c): if searchchars.count(c) == 0: searchchars.append(c) elif ord(c) > 127 and prev.count(c) == 0: str = str.replace(c, '\\x%x' % ord(c)) # perform search/replaces for c in searchchars: for i in self.charmap[c]: str = str.replace(i, self.dict[i]) return unicode(str, 'raw-unicode-escape')