#!/usr/bin/env python # # NOTE - This script requires Python version 2.0 or greater. # Replace "python" with "python2" above if necessary! # # # patent.py - Get patent image from USPTO and convert to pdf. # # usage: patent.py patent_number # # # Copyright (c) 2002, 2006 Everett Lipman # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # The GNU General Public License is available at # # http://www.fsf.org/copyleft/gpl.html # # Alternatively, you can write to the # # Free Software Foundation, Inc. # 59 Temple Place - Suite 330 # Boston, MA 02111-1307, USA # # # How to use this script: # # Change the WORKINGDIR variable below to the name of a # directory where the script can safely create files of # the form 'patent_number.P' and 'patent_number.ps'. # 'patent_number' is the number you specified on the # command line, and 'P' is a page number. The final # output will be in 'patent_number.pdf' # # You must have working versions of ps2pdf and tiff2ps # available. Specify the locations of these programs # in the variables PSPDF and TIFFPS below. ps2pdf comes # with Ghostscript, and tiff2ps comes with libtiff. # # This script is a kludge, and is guaranteed to break # if the USPTO changes their web interface. Consider # yourself lucky if it isn't already broken. 
#
# 31 July 2002  Everett Lipman
#
# 04May06  Updated to work with modified USPTO interface
#
# NOTE: the code below uses subprocess, 'with' and bytes literals, so it
#       needs Python 2.6 or later; it also runs unchanged on Python 3.
#
###############################################################################

import sys
import os
import urllib
import string
import shlex
import subprocess

try:
    from urllib.request import urlopen      # Python 3
except ImportError:
    from urllib import urlopen              # Python 2

###############################################################################

# Name shown in the usage message.  It is defined at import time so that
# usage() also works when this file is imported rather than run as a script.
progname = os.path.basename(sys.argv[0])


def usage(message=''):
    """Print an optional message and the usage line to stderr, then exit.

    message: extra text printed before the usage line; nothing extra is
             printed when it is empty.

    Always terminates by raising SystemExit with exit status 1.
    """
    # Write to stderr directly instead of rebinding sys.stdout, so that a
    # caller that catches SystemExit keeps a working sys.stdout.
    if message:
        sys.stderr.write('%s\n' % message)
    sys.stderr.write('\nusage: %s patent_number\n\n' % progname)
    #
    # 'patent_number' is the patent to get.
    #
    sys.exit(1)


# Accepted numbers of command-line arguments.  A trailing '*' entry means
# "any non-zero number of arguments" (see _valid_arg_count()).
N_ARGUMENTS = (1,)

###############################################################################

# os.path.expanduser() is used instead of os.getenv('HOME') so that importing
# this file does not fail with a TypeError when HOME is not set.
WORKINGDIR = os.path.join(os.path.expanduser('~'), 'tmp')
PSPDF = '/usr/bin/ps2pdf'
TIFFPS = '/usr/bin/tiff2ps'
TIFFPS_OPTS = '-p1 -w 8.5 -h 11'

###############################################################################


def _fetch(url):
    """Return the body found at url as bytes, always closing the connection."""
    connection = urlopen(url)
    try:
        return connection.read()
    finally:
        connection.close()


def _to_text(data):
    """Return data as a native str (bytes are decoded as latin-1)."""
    if isinstance(data, str):
        return data
    return data.decode('latin-1')


def get_id(patentnum):
    """Get ID and server name as if we were using the web interface.

    patentnum: patent number (an integer)

    returns: a two-element list containing the idkey and server name.
             The idkey is the 18 characters of the fetched page that start
             at the first occurrence of 'IDKey='.

    raises:  IOError if the page cannot be fetched or if it contains no
             'IDKey=' marker.
    """
    URL = 'http://patimg1.uspto.gov/.piw?Docid=%d&idkey=NONE' % patentnum
    server = 'patimg1'

    page = _fetch(URL)

    idindex = page.find(b'IDKey=')
    if idindex == -1:
        # Without this check the slice below would silently yield a bogus
        # key, and every later page request would be made with it.
        raise IOError('no IDKey found in the USPTO response for patent %d'
                      % patentnum)
    idkey = _to_text(page[idindex:idindex + 18])

    print('idkey: %s' % idkey)
    print('server: %s' % server)

    return [idkey, server]

###############################################################################


def get_page(patentnum, pagenum, paramlist):
    """Get patent page.

    get_page(patentnum, pagenum, paramlist)

    patentnum: patent number
    pagenum:   page number
    paramlist: [idkey, server]

    returns: the bytes of the tiff file for this page, or an empty
             bytes string if the page doesn't exist.
    """
    URL = ('http://%s.uspto.gov/.DImg?Docid=US%09d&PageNum=%d&%s&ImgFormat=tif'
           % (paramlist[1], patentnum, pagenum, paramlist[0]))

    page = _fetch(URL)

    # A response containing this byte run is treated as "no such page".
    # NOTE(review): presumably the run is unique to the placeholder image the
    # server sends for a non-existent page -- confirm against the server.
    if page.find(b'\x48\xb2\x61\x6d\x08\x86\x1a\x0c\x1a\x06') == -1:
        return page
    return b''

###############################################################################


def _valid_arg_count(nargs):
    """Return True if nargs command-line arguments satisfy N_ARGUMENTS."""
    if nargs != 0 and N_ARGUMENTS[-1] == '*':
        return True
    return nargs in N_ARGUMENTS


def _progress(text):
    """Write text and a trailing space, without a newline, and flush it."""
    sys.stdout.write(text + ' ')
    # Flush so the message is visible before a slow step starts.
    sys.stdout.flush()


def _run_tool(arguments, output=None):
    """Run an external program and raise IOError if it exits non-zero.

    arguments: the program and its arguments, as a list (no shell involved)
    output:    optional open file that receives the program's stdout
    """
    status = subprocess.call(arguments, stdout=output)
    if status != 0:
        raise IOError('%s exited with status %d' % (arguments[0], status))


def _remove_quietly(filenames):
    """Delete each named file, ignoring files that cannot be removed."""
    for filename in filenames:
        try:
            os.remove(filename)
        except OSError:
            # Best effort: a file may never have been created.
            pass


def _make_pdf(patno):
    """Download every page of patent patno and build patno.pdf in WORKINGDIR.

    Raises IOError or OSError if the working directory, the network, or one
    of the external converters fails.
    """
    print('')
    _progress('Changing to working directory %s...' % WORKINGDIR)
    os.chdir(WORKINGDIR)
    print('done.')
    print('')

    print('Connecting to USPTO...')
    paramlist = get_id(patno)

    # Pages are requested in order until get_page() reports a missing one.
    pages = []
    while True:
        pagestring = get_page(patno, len(pages) + 1, paramlist)
        if not pagestring:
            break
        print('Got page %d.' % (len(pages) + 1))
        pages.append(pagestring)

    if not pages:
        # Running the converters with no input would only produce an empty
        # or broken result, so stop here instead.
        raise IOError('no page images found for patent %d' % patno)

    psfile = '%d.ps' % patno
    pagelist = []
    try:
        for number, data in enumerate(pages, 1):
            pagefile = '%d.%d' % (patno, number)
            # Recorded before writing so that cleanup also covers a file
            # that was only partially written.
            pagelist.append(pagefile)
            # Binary mode: the page data is an image, not text.
            with open(pagefile, 'wb') as outfile:
                outfile.write(data)

        _progress('Converting to PostScript...')
        with open(psfile, 'wb') as psout:
            _run_tool([TIFFPS] + shlex.split(TIFFPS_OPTS) + pagelist, psout)
        print('done.')

        _progress('Creating pdf file...')
        _run_tool([PSPDF, psfile])
        print('done.')
    finally:
        # Runs on success and on failure, so temporary files never linger.
        _progress('Cleaning up temporary files...')
        _remove_quietly(pagelist + [psfile])
        print('done.')

    print('')
    print('Wrote file: %s/%d.pdf' % (WORKINGDIR, patno))
    print('')


def main(argv=None):
    """Command-line entry point.

    argv: argument list in the style of sys.argv (program name first);
          defaults to sys.argv.

    Exits with status 1, through usage(), when the argument count is wrong
    or the patent number is not an integer, and with status 1 after an
    error message when fetching or converting fails.
    """
    if argv is None:
        argv = sys.argv

    if not _valid_arg_count(len(argv) - 1):
        usage()

    # Validate the number before touching the file system or the network.
    try:
        patno = int(argv[1])
    except ValueError:
        usage('patent_number must be an integer, got %r' % argv[1])

    try:
        _make_pdf(patno)
    except EnvironmentError as err:
        # Covers IOError and OSError on Python 2; both are OSError on 3.
        sys.stderr.write('\n%s: %s\n' % (progname, err))
        sys.exit(1)


if __name__ == '__main__':
    main()