#!/usr/bin/env python
"""
Let's try to get some data from copyright.gov

Public Domain 2007 public.resource.org

Description:
Author: Joel Hardi

Usage examples at the bottom of this script.

Written for Python 2. The import fallbacks and str/bytes guards below are
intended to let the module also run on Python 3; that path has not been
verified against the live server.
"""

import re
import time

try:  # Python 2
    import urllib2
    from urllib import urlencode
    from httplib import HTTPException
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.parse import urlencode
    from http.client import HTTPException


def _write_file(filename, content):
    "Write content to filename, closing the file even if the write fails."
    # A str body keeps the original text mode; a bytes body (what Python 3's
    # response.read() returns) needs a binary-mode file.
    if isinstance(content, str):
        mode = 'w'
    else:
        mode = 'wb'
    fh = open(filename, mode)
    try:
        fh.write(content)
    finally:
        fh.close()


class locHTTP(object):
    "connect and retrieve stuff from the loc via http"

    # our HTTP headers
    headers = {
        "User-Agent": "public.resource.org PublicRecordBot 1.1",
    }

    # PID we retrieve from loc, timestamp when it was retrieved, its max age
    # (in seconds, compared against time.time() differences)
    pid = None
    pid_timestamp = None
    pid_maxlife = 900

    # count of consecutive http errors and the max we will allow before
    # raising exception and exiting
    http_errs = 0
    http_errs_max = 10

    def getPID(self):
        """Accessor method, returns current valid PID.

        Fetches a PID if none has been retrieved yet, and refreshes it when
        it is pid_maxlife seconds old or older.
        """
        if self.pid is None:
            self._fetchPID()
        # PID timeout after pid_maxlife seconds
        if (time.time() - self.pid_timestamp) >= self.pid_maxlife:
            self._fetchPID()
        return self.pid

    def saveRecord(self, reg_no):
        """Save a record given a registration number.

        POSTs the search form and writes the response body to
        '<reg_no>.txt' in the current directory.
        """
        url = 'http://cocatalog.loc.gov/cgi-bin/Pwebrecon.cgi'
        # NOTE(review): several values below ('1%2C1',
        # 'Format+for+Print%2FSave') are already percent-encoded, and
        # urlencode() will encode their '%' and '+' characters again.
        # Confirm this is what the server expects before changing them.
        args = {
            'Search_Arg': reg_no,
            'Search_Code': 'REGS',
            'SL': '',
            'CNT': '25',
            'ti': '1%2C1',
            'SID': '7',
            'RID': '13023929',
            'BIB': '13023929',
            'SEQ': '20070926210221',
            'RD': '2',
            'SAVE': 'Format+for+Print%2FSave',
            'MAILADDY': '',
            'EMAILADDRESS': 'None',
        }
        data = urlencode(args)
        content = self._fetchURL(url, data)
        _write_file(reg_no + '.txt', content)
        print("Saved " + reg_no + ".txt")

    def saveHIDs(self, first, last, skip=0, chunksize=300):
        """Save a block of records given a range first-last of HID numbers.

        first, last -- inclusive range of HID numbers to save.
        skip        -- optional offset to start off somewhere in the middle
                       of the range; fetching begins at first + skip.
        chunksize   -- optional number of records to fetch at a time (300
                       seems to be about the practical maximum).

        Each chunk is handed to _saveHIDSet(); the final chunk is clipped so
        it never runs past last.
        """
        # Chunk boundaries are derived from the actual starting point
        # (first + skip) so a skip that is not a multiple of chunksize
        # neither drops nor re-fetches records, and a range starting at
        # HID 0 is handled like any other.
        for start in range(first + skip, last + 1, chunksize):
            self._saveHIDSet(start, min(start + chunksize - 1, last))

    def _saveHIDSet(self, first, last):
        """Save a single block of records given a range of HID numbers.

        Requests HIDs first..last (inclusive) in one call and writes the
        response to 'hid_<first>-<last>.txt'. If the fetch yields no content
        it waits 60 seconds, refreshes the PID and tries again.
        """
        # NOTE(review): this is one less than the number of HIDs requested
        # below; confirm what the server's CNT parameter expects.
        cnt = last - first
        hidchk = ''.join(['&HID=%08d&CHK=%08d' % (i, i)
                          for i in range(first, last + 1)])
        url = ('http://cocatalog.loc.gov/cgi-bin/Pwebrecon.cgi'
               '?PostSearchSortBy1=NULL'
               + hidchk
               + ('&PostSearchSortBy2=NULL&SAB1=2007+01+03'
                  '&BOOL1=as+a+phrase'
                  '&FLD1=Reg+Number%%2FDoc+Number+%%28K017%%29+%%28K017%%29'
                  '&GRP1=OR+with+next+set&CNT=%d&SEQ=20070927220342'
                  '&REC=0&RD=2&SAVE=Format+for+Print%%2FSave&RC=0'
                  '&MAILADDY=&EMAILADDRESS=None&LIMITBUTTON=0') % cnt)

        # The PID is appended inside _fetchURL on every call, so the same
        # url can be reused across retries.
        content = self._fetchURL(url)
        while content is None:
            self.http_errs = self.http_errs + 1
            print("No content for %d-%d, retrying with new PID"
                  % (first, last))
            time.sleep(60)
            # Refresh PID
            self._fetchPID()
            # Try again
            content = self._fetchURL(url)

        filename = 'hid_%08d-%08d.txt' % (first, last)
        _write_file(filename, content)
        print("Saved HIDs %d-%d to %s" % (first, last, filename))

    def _fetchURL(self, url, data=None, appendPID=True):
        """Wraps urllib2.urlopen() calls, appends PID, tries to recover from
        some loc server errors.

        url       -- URL to request. When appendPID is true and data is
                     None, '&PID=...' is appended to it.
        data      -- optional urlencoded POST body. When appendPID is true,
                     '&PID=...' is appended to it instead of to the URL.
        appendPID -- only the value True enables PID appending.

        Returns the response body. On HTTPException it sleeps 60 seconds and
        retries, refreshing the PID once half of http_errs_max consecutive
        errors have been seen. Raises RuntimeError after http_errs_max
        consecutive HTTP errors. Only HTTPException is retried; any other
        exception propagates to the caller.
        """
        max_errors = self.http_errs_max
        try:
            senddata = data
            sendurl = url
            if appendPID is True:
                # getPID may call _fetchPID, which makes a recursive call to
                # _fetchURL since it uses HTTP
                pid = self.getPID()
                if data is not None:
                    senddata = data + '&PID=%s' % pid
                else:
                    sendurl = url + '&PID=%s' % pid
            # Python 3 requires a bytes request body; on Python 2 bytes is
            # str, so an ordinary str body is left untouched.
            if senddata is not None and not isinstance(senddata, bytes):
                senddata = senddata.encode('ascii')
            # Always send our headers; a None body still results in a GET.
            req = urllib2.Request(sendurl, senddata, self.headers)
            response = urllib2.urlopen(req)
            try:
                content = response.read()
            finally:
                response.close()
            self.http_errs = 0
            return content
        except HTTPException as e:
            self.http_errs = self.http_errs + 1
            print(e)
            print('(Number consecutive HTTP errors: %d)' % self.http_errs)
            if self.http_errs >= max_errors:
                raise RuntimeError("%d consecutive HTTP errors reached"
                                   % max_errors)
            time.sleep(60)
            # Refresh PID
            if self.http_errs == (max_errors // 2):
                self._fetchPID()
            # Try again (recurse) and hand the retried result back to the
            # caller rather than dropping it.
            return self._fetchURL(url, data, appendPID)

    def _fetchPID(self):
        """Private method to fetch new PID from loc.

        Stores the PID string in self.pid and the retrieval time in
        self.pid_timestamp. Raises RuntimeError if no PID can be found in
        the response.
        """
        url = ('http://cocatalog.loc.gov/cgi-bin/Pwebrecon.cgi'
               '?DB=local&PAGE=First')
        content = self._fetchURL(url, None, False)
        if content is None:
            raise RuntimeError("No content returned while fetching PID")
        if not isinstance(content, str):
            # Python 3 returns bytes; latin-1 decodes any byte sequence, and
            # only ASCII digits are extracted below.
            content = content.decode('latin-1')
        # BeautifulSoup bonks on this slag HTML so I am just going to regex
        # it. NOTE(review): the leading '<input[^>' is a reconstruction; the
        # pattern previously began with ']+pid', which requires a literal
        # ']' directly before 'pid'. Confirm against the live page.
        m = re.search('<input[^>]+pid[^>]+value=("|\')?([0-9]+)',
                      content, re.I)
        if m is None:
            raise RuntimeError("Could not find a PID in the response")
        self.pid = m.group(2)
        self.pid_timestamp = time.time()


#
#
# Some tests and examples
if __name__ == '__main__':
    loc = locHTTP()

    # Get a PID (not actually necessary to explicitly do this; the other
    # class methods will fetch a PID when they think they need to).
    print("Let's get a PID")
    pid = loc.getPID()
    print("Our PID is %s" % pid)

    # Save a single record from a registration number
    reg = "SR0000301270"
    print("Let's save record with registration number %s" % reg)
    loc.saveRecord(reg)

    # Save 1000 records, those with HIDs 1000-1999
    first = 1000
    last = 1999
    print("Let's save records with HIDs from %d to %d" % (first, last))
    loc.saveHIDs(first, last)

    # Print the current PID
    pid = loc.getPID()
    print("Our PID is now %s" % pid)

    # Now test PID expiry, should print a different PID from that just
    # printed
    print("Now setting it to expire after 10 seconds, sleeping 10, "
          "then doing getPID again")
    loc.pid_maxlife = 10
    time.sleep(10)
    pid = loc.getPID()
    print("Our PID is now %s" % pid)