#!/usr/bin/env python
"""Generate RSS 2.0 and Atom feeds listing the copyright.gov bulk tarballs.

Tarballs matching 'hid*tgz' are looked up under ./copyright/bulk and the
first `items_to_show` of them (after reversing the listing) are published
as feed items.  Two files are written: copyright.bulk.rss.xml and
copyright.bulk.atom.xml.
"""

from feedhelpers import newChild, rssDate, atomDate
from filehelpers import all_files
import email.Utils, os, re, string, sys, time
import cElementTree as ElementTree


def main():
    """Build both feed documents and write them into `feed_dir`.

    Exits quietly via sys.exit() when no tarballs are found, so existing
    feed files are left untouched in that case.
    """
    # put feed .xml files in ./
    feed_dir = ''
    # tarballs are in ./copyright/bulk
    tgz_dir = 'copyright/bulk'
    # number of tarballs to list in feed
    items_to_show = 50

    path_list = list(all_files(tgz_dir, 'hid*tgz'))
    if not path_list:
        sys.exit()

    # Reverse order of files, so newest are first.
    # NOTE(review): this assumes all_files() yields paths oldest-first;
    # confirm against filehelpers.
    path_list.reverse()

    # Common feed elements
    feed_title = "copyright.gov bulk feed by public.resource.org"
    feed_link = "http://public.resource.org/"
    feed_copyrights = "public domain"
    feed_imageurl = "http://public.resource.org/seal.head.jpg"
    feed_description = ''.join([
        "Latest registrations in the United States Copyright Office ",
        "Catalog of Registrations (1978-present). For bulk data to 1978, see ",
        "http://bulk.resource.org/copyright/hids/ . There is also an ",
        "up-to-the-minute feed of new records as they are added, go to ",
        "http://rss.resource.org/ for this and our other feeds."])

    # The feed-level timestamp is the mtime of the first (newest) tarball.
    now_tstamp = os.path.getmtime(path_list[0])

    # RSS channel elements
    now_rss = rssDate(now_tstamp)  # RFC 822
    rss = ElementTree.Element("rss")
    rss.set("version", "2.0")
    rss_c = ElementTree.SubElement(rss, "channel")
    newChild(rss_c, "title", feed_title)
    newChild(rss_c, "link", feed_link)
    newChild(rss_c, "description", feed_description)
    newChild(rss_c, "copyright", feed_copyrights)
    newChild(rss_c, "pubDate", now_rss)
    newChild(rss_c, "lastBuildDate", now_rss)
    newChild(rss_c, "docs", "http://www.rssboard.org/rss-specification")
    newChild(rss_c, "ttl", "5")
    rss_im = newChild(rss_c, "image")
    newChild(rss_im, "url", feed_imageurl)
    newChild(rss_im, "title", "The Seal of Approval")
    newChild(rss_im, "link", "http://public.resource.org/")

    # Atom channel elements
    now_atom = atomDate(now_tstamp)  # RFC 3339
    url_atom = 'http://rss.resource.org/copyright.bulk.atom.xml'
    atom = ElementTree.Element("feed")
    atom.set("xmlns", "http://www.w3.org/2005/Atom")
    newChild(atom, "title", feed_title)
    newChild(atom, "link", None,
             {'title':'public.resource.org homepage',
              'rel':'related',
              'href':feed_link})
    newChild(atom, "link", None,
             {'title':'this feed', 'rel':'self', 'href':url_atom})
    atom_a = newChild(atom, "author")
    newChild(atom_a, "name", "public.resource.org")
    newChild(atom_a, "uri", feed_link)
    newChild(atom, "id", url_atom)
    newChild(atom, "updated", now_atom)
    newChild(atom, "logo", feed_imageurl)
    newChild(atom, "icon", feed_imageurl)
    newChild(atom, "rights", feed_copyrights)
    newChild(atom, "subtitle", feed_description)

    # Add items: only the first `items_to_show` paths are published.
    for path in path_list[:items_to_show]:
        # Common item vars
        item_filename = os.path.basename(path)
        # The public URL is the site root plus the path as returned by
        # all_files(), so that path must be relative to the web root.
        item_link = "http://rss.resource.org/" + path
        item_desc = ("Archive of copyright records from copyright.gov: "
                     + item_filename)
        item_tstamp = os.path.getmtime(path)
        item_size = str(os.path.getsize(path))

        # RSS item
        rss_it = newChild(rss_c, "item")
        newChild(rss_it, "title", item_filename)
        newChild(rss_it, "link", item_link)
        newChild(rss_it, "enclosure", None,
                 {'url':item_link,
                  'length':item_size,
                  'type':'application/x-tar-gz'})
        newChild(rss_it, "description", item_desc)
        newChild(rss_it, "guid", item_link)
        newChild(rss_it, "pubDate", rssDate(item_tstamp))

        # Atom entry
        atom_ent = newChild(atom, "entry")
        newChild(atom_ent, "title", item_filename)
        newChild(atom_ent, "id", item_link)
        newChild(atom_ent, "updated", atomDate(item_tstamp))
        newChild(atom_ent, "content", item_desc, {'type':'text'})
        newChild(atom_ent, "link", None,
                 {'title':'Tarball of copyright records in Latin-1 MARC format',
                  'type':'application/x-tar-gz',
                  'rel':'enclosure',
                  'length':item_size,
                  'href':item_link})

    # Honour feed_dir for the output location; with the default '' the
    # joined paths are just the bare file names in the current directory.
    rss_tree = ElementTree.ElementTree(rss)
    rss_tree.write(os.path.join(feed_dir, "copyright.bulk.rss.xml"),
                   encoding="utf-8")
    atom_tree = ElementTree.ElementTree(atom)
    atom_tree.write(os.path.join(feed_dir, "copyright.bulk.atom.xml"),
                    encoding="utf-8")


if __name__ == '__main__':
    main()