#!/usr/bin/env python
"""Publish newly added copyright records as RSS 2.0 and Atom feeds.

When run as a script this module:

1. finds the highest HID among the ``hid_*`` files reported by
   ``last('./copyright', 'hid_*')``,
2. asks the LOC (through ``locHTTP``) for the next ``num_records`` records,
   but only once at least twice that many new records are available,
3. writes one ``hid_NNNNNNNN.mrc`` file per record under ``mrc_dir`` and logs
   HIDs that returned nothing to ``missing_hids.log``,
4. writes ``copyright.rss.xml`` and ``copyright.atom.xml`` into ``feed_dir``.
"""

import email.Utils
import os
import re
import string
import sys
import time

import cElementTree as ElementTree
from BeautifulSoup import BeautifulStoneSoup

from copyright.lochttp import locHTTP
from copyright.marctools import locToUTF8
from feedhelpers import newChild, rssDate, atomDate
from filehelpers import last

# A label starts a line with a word character and runs up to and including
# the first colon.  The capturing group makes split() return the labels too.
_LABEL_PAT = re.compile(r'^(\w[^:]+:)', re.M)
# A paragraph break is an empty (or whitespace-only) line.
_PARA_BREAK_PAT = re.compile(r'\n\s*\n')


def formatText(text):
    """Return the record text re-assembled from its labels and bodies.

    ``text`` is stripped of leading whitespace and split on line-initial
    labels.  Empty segments are dropped; every remaining segment is
    stripped, wrapped in separators and concatenated, and the result is
    wrapped in one more pair of separators.  Paragraph breaks inside a body
    segment are normalized to a single separator pair.

    NOTE: we are not creating tags as XML elements, since what we actually
    want is entity-encoded HTML, per
    http://www.rssboard.org/rss-encoding-examples

    NOTE(review): in this copy of the file every separator literal contains
    only line breaks, while the comments talk about markup around the list
    labels.  The tag text may have been lost before this file reached
    review -- confirm the literals against the canonical source.
    """
    # One capturing group means split() alternates body text (even indexes)
    # with the captured labels (odd indexes).
    segments = _LABEL_PAT.split(text.lstrip())
    pieces = []
    for index, segment in enumerate(segments):
        if not segment:
            continue
        if index % 2 == 0:
            body = _PARA_BREAK_PAT.sub('\n\n', segment.strip())
            pieces.append('\n' + body + '\n')
        else:
            pieces.append('\n' + segment.strip() + '\n')
    return '\n' + ''.join(pieces) + '\n'


if __name__ == '__main__':
    # put feed .xml files in ./
    feed_dir = ''
    # put .mrc files in ./copyright/current
    mrc_dir = 'copyright/current'
    # put missing_hids.log here
    log_dir = '../'
    # number of records to fetch / publish in feed
    num_records = 1000

    # determine HID of last hid_xxxx.mrc or hid_xxxx-xxxx.tgz file saved
    numpattern = re.compile(r'([0-9]+)\.[^\.]+$')
    prev_hids = []
    for path in last('./copyright', 'hid_*'):
        found = numpattern.findall(path)
        if len(found) == 1:
            prev_hids.append(int(found[0]))
    if not prev_hids:
        sys.exit('There are no HID files saved at all, refusing to start at HID #1.')

    # First HID to grab is last HID saved + 1
    first_hid = max(prev_hids) + 1
    del prev_hids

    # Create loc object and start querying the LOC
    loc = locHTTP()

    # Because there are often gaps amongst just-added records, only add 1000
    # more records if there are at least 2000 new records available
    if loc.getHID(first_hid + (2 * num_records) - 1) is None:
        sys.exit()
    last_hid = first_hid + num_records - 1

    # Same HID range fetched twice: once for the text used in the feed items
    # and once for the data saved as .mrc files.
    text = loc.getHIDs(first_hid, last_hid, 0, None, 0, 'dict')
    marc = loc.getHIDs(first_hid, last_hid, 0, None, 2, 'dict')

    # Reverse order hid list, since for feed purposes we want descending order
    hids = list(range(first_hid, first_hid + len(text)))
    hids.reverse()

    # Common feed elements
    feed_title = "copyright.gov feed by public.resource.org"
    feed_link = "http://public.resource.org/"
    feed_copyrights = "public domain"
    feed_imageurl = "http://public.resource.org/seal.head.jpg"
    # NOTE(review): the break after "hids/ . " is a bare newline in this copy
    # of the file; it may originally have been markup -- confirm.
    feed_description = (
        "Latest registrations in the United States Copyright Office "
        "Catalog of Registrations (1978-present). For bulk data to 1978, see "
        "http://bulk.resource.org/copyright/hids/ . \nThere is also a daily bulk "
        "feed of each day's new records, go to http://rss.resource.org/ for "
        "this and our other feeds.")
    now_tstamp = time.time()

    # RSS channel elements
    now_rss = rssDate(now_tstamp)  # RFC 822
    rss = ElementTree.Element("rss")
    rss.set("version", "2.0")
    rss_c = ElementTree.SubElement(rss, "channel")
    newChild(rss_c, "title", feed_title)
    newChild(rss_c, "link", feed_link)
    newChild(rss_c, "description", feed_description)
    newChild(rss_c, "copyright", feed_copyrights)
    newChild(rss_c, "pubDate", now_rss)
    newChild(rss_c, "lastBuildDate", now_rss)
    newChild(rss_c, "docs", "http://www.rssboard.org/rss-specification")
    newChild(rss_c, "ttl", "5")
    rss_im = newChild(rss_c, "image")
    newChild(rss_im, "url", feed_imageurl)
    newChild(rss_im, "title", "The Seal of Approval")
    newChild(rss_im, "link", "http://public.resource.org/")

    # Atom channel elements
    now_atom = atomDate(now_tstamp)  # RFC 3339
    url_atom = 'http://rss.resource.org/copyright.atom.xml'
    atom = ElementTree.Element("feed")
    atom.set("xmlns", "http://www.w3.org/2005/Atom")
    newChild(atom, "title", feed_title)
    newChild(atom, "link", None,
             {'title':'public.resource.org homepage','rel':'related','href':feed_link})
    newChild(atom, "link", None,
             {'title':'this feed','rel':'self','href':url_atom})
    atom_a = newChild(atom, "author")
    newChild(atom_a, "name", "public.resource.org")
    newChild(atom_a, "uri", feed_link)
    newChild(atom, "id", url_atom)
    newChild(atom, "updated", now_atom)
    newChild(atom, "logo", feed_imageurl)
    newChild(atom, "icon", feed_imageurl)
    newChild(atom, "rights", feed_copyrights)
    newChild(atom, "subtitle", feed_description)

    # Add items and save MARC records to filesystem
    title_pat = re.compile(r'Title:\s+((.+)(\n.*\S.+)*)\n[\r\f\v\t ]*\n')
    ws_pat = re.compile(r'\s{2,}')

    # Better 8-bit character fix, converts textblob to unicode
    charConverter = locToUTF8()

    for i, hid in enumerate(hids):
        if text[hid] is None or marc[hid] is None:
            # Nothing for this HID yet, log it to missing_hids.log
            fh = open(log_dir+'missing_hids.log', 'a')
            try:
                print("Missing HID %d skipped." % hid)
                fh.write(str(hid)+'\n')
            finally:
                fh.close()
            continue

        unitext = charConverter.replace(text[hid])
        # Decode any HTML entities in textblob
        unitext = unicode(BeautifulStoneSoup(
            unitext, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))

        # Use the record's "Title:" field, with runs of whitespace collapsed,
        # as the item title; fall back to the HID when there is none.
        title_match = title_pat.search(unitext)
        if title_match:
            title = ws_pat.sub(' ', title_match.group(1)).strip()
        else:
            title = str(hid)

        # Common item vars
        item_filename = mrc_dir+"/hid_%08d.mrc" % hid
        item_link = "http://rss.resource.org/" + item_filename
        item_html_text = formatText(unitext)
        # Each later (lower-HID) entry is stamped 1/16 s earlier than the
        # previous one, so timestamps descend in the same order as the HIDs.
        item_tstamp = now_tstamp - (i * .0625)

        # RSS item
        rss_it = newChild(rss_c, "item")
        newChild(rss_it, "title", title)
        newChild(rss_it, "link", item_link)
        newChild(rss_it, "description", item_html_text)
        newChild(rss_it, "guid", item_link)

        # Atom entry
        atom_ent = newChild(atom, "entry")
        newChild(atom_ent, "title", title)
        newChild(atom_ent, "id", item_link)
        newChild(atom_ent, "updated", atomDate(item_tstamp))
        newChild(atom_ent, "content", item_html_text, {'type':'html'})
        newChild(atom_ent, "link", None,
                 {'title':'Record in Latin-1 MARC format',
                  'type':'application/marc',
                  'rel':'alternate',
                  'href':item_link})

        # Save txt of MARC record
        fh = open(item_filename, 'w')
        try:
            fh.write(marc[hid])
        finally:
            fh.close()

    rss_tree = ElementTree.ElementTree(rss)
    rss_tree.write(os.path.join(feed_dir, "copyright.rss.xml"), encoding="utf-8")
    atom_tree = ElementTree.ElementTree(atom)
    atom_tree.write(os.path.join(feed_dir, "copyright.atom.xml"), encoding="utf-8")