# -*- coding: utf-8 -*-
"""Routines for scraping data about parts from Farnell"""
from urllib import urlopen
import string
import sgmllib
import sys
import re
import os
import hashlib
import time
from decimal import Decimal

# Number of seconds for the cache to last for
CACHE_LIFE = 36000

# Delimiters of the two page sections that get fed to the parser.
# NOTE(review): in the copy of this file that was reviewed, all four of these
# literals contain nothing but a line break, which strongly suggests the
# original markup delimiters were stripped somewhere along the way.  They are
# reproduced here exactly as found; restore the real markers before relying on
# Item, because with these values the extracted sections are empty.
_AVAIL_START = '\n'
_AVAIL_END = '\n'
_PRICE_START = '\n'
_PRICE_END = '\n'

# Patterns are compiled once at import time rather than on every call.
_QTY_RANGE_RE = re.compile(r"([0-9,]+)\s*-\s*([0-9,]+)")
# Quantities are converted with int(), so (unlike costs) no "." is accepted.
_QTY_SINGLE_RE = re.compile(r"([0-9][0-9,]*)")
_COST_RE = re.compile(r"([0-9]{1}[0-9,.]*)")
# Tried in this order by Item._parse_price_for.
_PRICE_FOR_RES = (
    re.compile(r"Reel of ([0-9,]+)"),
    re.compile(r"Pack of ([0-9,]+)"),
    re.compile(r"([0-9,]+) Each"),
)


def grab_url_cached(url):
    """Return the body of *url*, going through an on-disk cache.

    Responses are stored under ``~/.sr/cache/farnell`` in a file named after
    the SHA-1 hex digest of the URL.  A cached copy is used when its
    modification time is less than CACHE_LIFE seconds old; otherwise the URL
    is fetched again and the cache file is rewritten.

    Errors from the network or the filesystem propagate to the caller.
    """
    cache_dir = os.path.expanduser("~/.sr/cache/farnell")
    if not os.path.isdir(cache_dir):
        try:
            os.makedirs(cache_dir)
        except OSError:
            # Another process may have created the directory in the
            # meantime; only give up if it really is not there.
            if not os.path.isdir(cache_dir):
                raise

    cache_file = os.path.join(cache_dir, hashlib.sha1(url).hexdigest())

    # Ask for the mtime directly instead of exists() + getmtime(), so a file
    # that disappears between the two calls is simply treated as a miss.
    try:
        age = time.time() - os.path.getmtime(cache_file)
    except OSError:
        age = None

    if age is not None and age < CACHE_LIFE:
        # Binary mode: the cache holds the response exactly as received.
        with open(cache_file, "rb") as f:
            return f.read()

    response = urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    with open(cache_file, "wb") as f:
        f.write(page)
    return page


def _cut_section(page, start_marker, end_marker, partNumber):
    """Return the part of *page* from *start_marker* up to *end_marker*.

    The returned text starts at (and includes) the start marker.  The end
    marker is searched for from that same position and is not included.

    Raises Exception when the start marker is absent, which this module
    reports as the part number not existing.
    """
    start = page.find(start_marker)
    if start == -1:
        raise Exception("""Part number "%s" doesn't exist""" % str(partNumber))

    section = page[start:]
    end = section.find(end_marker)
    if end == -1:
        # No end marker: keep the whole remainder rather than letting a
        # slice with -1 silently drop the final character.
        return section
    return section[:end]


class Item(sgmllib.SGMLParser):
    """Represents a Farnell item.

    Constructing an Item fetches the page for the part number (through
    grab_url_cached) and parses it immediately.  Afterwards:

    * ``prices`` is a list of ``(quantity, cost)`` tuples in page order,
      where quantity is an int (the upper end of a "a - b" range, or the
      single number found) and cost is a Decimal, or None when no number
      could be found in the price cell.
    * ``price_for``, ``min_order``, ``multi`` and ``avail`` hold the parsed
      "Price For", "Minimum Order Quantity", "Order Multiple" and
      "Availability" fields, or None if the page did not contain them.
    """

    def __init__(self, partNumber, verbose=0):
        """Fetch and parse the page for *partNumber*.

        'verbose' is passed to the superclass.  Raises Exception if the
        expected sections cannot be found in the page.
        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.qty_range = []
        self.cost = []
        self.inside_td_element = 0
        self.inside_p_element = 0
        self.inside_b_element = 0
        self.last_data = ''
        self.qty = True
        self.qty_str = ""
        self.last_qty = None
        self.prices = []

        # Give every parsed field a defined value up front so that
        # get_info() does not raise AttributeError when the page lacks one.
        self.price_for = None
        self.min_order = None
        self.multi = None
        self.avail = None

        self.feed(self.__getData(partNumber))
        self.close()

    def __getData(self, partNumber):
        """Return the availability and pricing sections of the part's page."""
        page = grab_url_cached('http://xgoat.com/p/farnell/' + str(partNumber))
        availInfo = _cut_section(page, _AVAIL_START, _AVAIL_END, partNumber)
        priceInfo = _cut_section(page, _PRICE_START, _PRICE_END, partNumber)
        return availInfo + priceInfo

    def start_td(self, attributes):
        "Process a table div."
        self.inside_td_element += 1

    def end_td(self):
        "Record the end of a table div."
        self.inside_td_element -= 1

    def start_p(self, attributes):
        "Process a paragraph."
        self.inside_p_element += 1

    def end_p(self):
        "Record the end of a paragraph."
        self.inside_p_element -= 1

    def start_b(self, attributes):
        "Process a bold."
        self.inside_b_element += 1

    def end_b(self):
        "Record the end of a bold."
        self.inside_b_element -= 1

    def handle_data(self, data):
        """Handle the textual 'data'.

        Text inside a table cell is accumulated as a quantity description
        until a cell containing a pound sign arrives, which completes one
        price break.  Text inside a bold element is remembered as a field
        label; the next paragraph text is then interpreted according to
        that label.
        """
        data = data.replace('\n', '').replace(':', '')
        if data.replace(' ', '') == '':
            return

        if self.inside_td_element > 0:
            if "£" in data:
                # Take whatever follows the pound sign instead of assuming
                # the sign occupies exactly the first two bytes of the cell.
                self._add_price_range(self.qty_str, data.split("£", 1)[1])
                self.qty_str = ""
            else:
                self.qty_str += data

        elif self.inside_b_element > 0:
            self.last_data = data

        elif self.inside_p_element > 0:
            # Consume the remembered label; it is put back below if it is
            # not one of the fields handled here.
            label = self.last_data
            self.last_data = ''

            if label == 'Price For':
                self.price_for = self._parse_price_for(data)
            elif label == 'Minimum Order Quantity':
                # Strip commas, as the other numeric parsers here do.
                self.min_order = int(data.replace(',', ''))
            elif label == 'Order Multiple':
                self.multi = int(data.replace(',', ''))
            elif label == 'Availability':
                # Anything that is not a plain number counts as 0.
                digits = data.replace(',', '').strip()
                self.avail = int(digits) if digits.isdigit() else 0
            else:
                # not this time around
                self.last_data = label

    def _parse_price_for(self, s):
        """Break the 'price for' string up.

        Returns the number from "Reel of N", "Pack of N" or "N Each" (tried
        in that order) as an int.  If none matches, prints a warning and
        returns None.
        """
        for pattern in _PRICE_FOR_RES:
            m = pattern.search(s)
            if m is not None:
                # Strip commas
                return int(m.group(1).replace(",", ""))

        print("""Warning: Farnell script can't parse price_for field "%s".""" % s)
        return None

    def _add_price_range(self, qty, cost):
        """Record one price break.

        Appends ``(quantity, cost)`` to ``self.prices``.  The entry is
        dropped when the quantity text cannot be parsed.
        """
        q = self._parse_qty(qty)
        c = self._parse_cost(cost)

        if q is None:
            return

        self.prices.append((q, c))

    def _parse_qty(self, qty):
        """Return the quantity described by *qty* as an int.

        For a range such as "10 - 99" only the higher end is returned;
        otherwise the first number found is used.  Prints a warning and
        returns None when no number is present.
        """
        m = _QTY_RANGE_RE.search(qty)
        if m is not None:
            # Strip commas; only use the higher end of the range
            return int(m.group(2).replace(",", ""))

        m = _QTY_SINGLE_RE.search(qty)
        if m is not None:
            # Strip commas
            return int(m.group(1).replace(",", ""))

        print("""Warning: Farnell script can't parse quantity field: "%s".""" % qty)
        return None

    def _parse_cost(self, cost):
        """Return the first number in *cost* as a Decimal, or None if absent."""
        m = _COST_RE.search(cost)
        if m is None:
            return None
        # Strip commas
        return Decimal(m.group(1).replace(",", ""))

    def get_info(self):
        """Return a dict of the info garnered.

        NOTE(review): 'qty' and 'price' come from self.qty_range and
        self.cost, which nothing in this file ever appends to, so they are
        empty lists here.  The parsed price breaks live in self.prices.
        Confirm whether callers expect these two keys to be populated.
        """
        return dict(qty=self.qty_range, price=self.cost,
                    num_for_price=self.price_for,
                    min_order=self.min_order, multiple=self.multi,
                    number_available=self.avail)

    def print_info(self):
        """Print the info garnered in a nice way.

        NOTE(review): the pricing table starts counting at self.min_order,
        so this fails if the page had price breaks but no minimum order
        quantity.
        """
        print(' Number Available: %s' % (self.avail,))
        print(' Price For: %s' % (self.price_for,))
        print(' Minimum Order Quantity: %s' % (self.min_order,))
        print(' Order Multiple: %s' % (self.multi,))
        print(' Pricing:')

        # Each entry's quantity is the top of its band; the next band starts
        # one above it.  An entry whose quantity equals the running lower
        # bound is shown as open-ended ("n +").
        n = self.min_order
        for p in self.prices:
            if n != p[0]:
                print("\t%i - %i: \t£%s" % (n, p[0], p[1]))
                n = p[0] + 1
            else:
                print("\t%i +: \t£%s" % (n, p[1]))