Writing your own crawler: copider
copider mimics some of Scrapy's conventions, though mine is single-process and synchronous, not asynchronous.
1. File: copider/copider.py
#coding=utf-8
'''
Created on 2015-10-08
@author: snt1
'''

import urllib2
import lxml.html


class Spider(object):
    def __init__(self, url, meta=None):
        self.URL = url
        self.META = meta
        self.TEXTMARK = self.get(url)
        self.SEL = self.selector(doc=self.TEXTMARK)

    def get(self, url):
        # Fetch the page; on failure, log and fall back to an empty
        # document so the caller does not crash on an undefined variable.
        try:
            req = urllib2.Request(url)
            req.add_header('User-Agent',
                           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/44.0.2403.155 Safari/537.36')
            shtml = urllib2.urlopen(req, timeout=15).read()
        except Exception, e:
            print e, "...next.."
            shtml = '<html></html>'
        return shtml  # return the HTML

    @property
    def html(self):
        return self.TEXTMARK

    @property
    def url(self):
        return self.URL

    @property
    def meta(self):
        return self.META

    def selector(self, doc=None):
        # original referenced self.HTML, which does not exist
        HTML = doc if doc else self.TEXTMARK
        return lxml.html.fromstring(HTML)

    def xpath(self, rule):
        # For element results, return their attribute dicts; for string
        # results (text() or @attr expressions), return the list as-is.
        iter_list = self.SEL.xpath(rule)
        try:
            return [ele.attrib for ele in iter_list]
        except AttributeError:
            return iter_list


def Request(url, func, **meta):
    # Synchronous stand-in for scrapy.Request: fetch immediately,
    # then hand the response to the callback.
    if meta:
        response = Spider(url, meta['meta'])
    else:
        response = Spider(url)
    func(response)
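For a quick sanity check of the API, the two primitives compose like this. A minimal sketch: the URL and the XPath below are illustrative placeholders, not part of copider itself.

#coding=utf-8
# Hypothetical usage example -- example.com and the XPath are placeholders.
from copider import Spider, Request

def parse(response):
    # response is a Spider instance: .url, .html, .meta and .xpath() are available
    print response.url
    print response.meta
    print response.xpath('//title/text()')

# Fetch synchronously and hand the response to the callback,
# passing extra data through meta, scrapy-style.
Request('http://example.com/', parse, meta={'page': 1})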
2. File: copider/aero.py
#coding=utf-8
'''
Created on 2015-10-08
@author: snt1
'''

import re
from copider import Spider, Request


class AeroCopider(object):
    name = "aero"
    storeId = "554b14c97b010cc731e81b35"    # site ID
    allowed_domains = ["www.xxxx.com"]
    root_url = 'http://www.xxxx.com'
    category_url = root_url + '/category/index.jsp?numResultsPerPage=100&categoryId=%s'
    cap_category_url = root_url + '/family/index.jsp?categoryId=%s&page=%d&numResultsPerPage=100'
    url_dicts = {'3534623': 'Girls', '3534624': 'Guys'}

    def __init__(self):
        self.start_urls()

    def start_urls(self):
        pattern = re.compile(r'family\.jsp\?categoryId=')
        pattern_sub = re.compile(r'&cp=.*?$')
        pattern_fin = re.compile(r'family\.jsp\?categoryId=(\d+)')
        for fid in self.url_dicts.keys():
            url = self.category_url % fid
            response = Spider(url)
            node_a = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/@href')
            node_text = response.xpath('//*[@id="sidebar-left"]/div/dl[2]/dd//dt/a/text()')
            for num, preparing in enumerate(node_a):
                if pattern.search(preparing):
                    chd_url = self.root_url + preparing
                    # strip the trailing &cp=... parameter
                    # (note: sub()'s third argument is count, not flags)
                    chd_url = pattern_sub.sub('', chd_url)
                    cid = pattern_fin.findall(chd_url)[0]
                    print('category link: %s -> %s' % (node_text[num], chd_url))
                    # pass the category id directly instead of indexing a
                    # separate list, which could fall out of step
                    Request(chd_url, self.parse_page, meta={'cateid': cid})
            print  # blank line between categories

    def parse_page(self, response):
        total_items = int(response.xpath('//*[@id="main-wrap"]//li[@class="count"]/span/text()')[0])
        # 100 results per page, so round the page count up
        quotient, rem = divmod(total_items, 100)
        total_page = quotient + 1 if rem else max(quotient, 1)
        print('total pages: %s -> %s' % (total_page, response.url))
        cateid = response.meta['cateid']
        for page in range(1, total_page + 1):
            url = self.cap_category_url % (cateid, page)
            Request(url, self.parse_product)

    def parse_product(self, response):
        product = response.xpath('//*[@id="products"]//h4/a/@href')
        print('source page: %s' % response.url)
        print('products: %s -> paths: %s' % (len(product), product))


if __name__ == '__main__':
    AeroCopider()
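The page arithmetic in parse_page is just a ceiling division: with 100 results per page, 250 items need 3 pages, 150 need 2, and an exact multiple like 100 needs only 1. A standalone sketch of the same calculation (not part of the original file):

def page_count(total_items, per_page=100):
    # ceiling division, with a minimum of one page
    return max((total_items + per_page - 1) // per_page, 1)

assert page_count(250) == 3
assert page_count(150) == 2
assert page_count(100) == 1
assert page_count(0) == 1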