晋江年下文爬取【xpath】
"""Crawl Jinjiang (jjwxc.net) search results for the keyword "年下" with XPath.

Target  : http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p=1
Author  : laoalo
Modified: 2019/8/31 15:19

NOTE: the original header announced "crawl 6 pages", while the code has always
walked result pages 1-4.  The defaults of ``spider`` keep the 1-4 range; pass
``last_page=6`` to get what the header describes.

Known limitation (carried over from the original notes): the tags of a novel
are not extracted.
"""
import time

import requests
from lxml import etree

head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    # NOTE: requests derives Host from the URL on its own; this fixed value is
    # only correct for URLs that live on www.jjwxc.net.
    'Host': 'www.jjwxc.net',
}

# Search URL for the GBK-percent-encoded keyword "年下"; only the page varies.
SEARCH_URL = 'http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p={page_num}'

# The original decoded every page as GBK (via a latin-1 round-trip); the
# encoding is now set once on the response instead.
PAGE_ENCODING = 'gbk'
REQUEST_TIMEOUT = 50   # seconds, same value the original passed to requests
MAX_ATTEMPTS = 3       # tries per URL before the error is given up on
REQUEST_DELAY = 0.5    # seconds; pause between book requests / retry back-off


def _fetch_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The request is retried up to ``MAX_ATTEMPTS`` times with a growing pause,
    because timeouts and refused connections were the most frequent failure.

    Returns:
        The parsed root element, or ``None`` when the body is empty or lxml
        could not build a tree from it.

    Raises:
        requests.RequestException: when every attempt failed (timeout,
            connection error or HTTP error status).
    """
    last_error = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(url=url, headers=head,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as error:
            last_error = error
            if attempt < MAX_ATTEMPTS:
                time.sleep(REQUEST_DELAY * attempt)
            continue
        # Decode once at the I/O boundary instead of re-encoding every
        # extracted string through ISO-8859-1 afterwards.
        response.encoding = PAGE_ENCODING
        html = response.text
        if not html.strip():
            return None
        return etree.HTML(html)
    raise last_error


def get_page_detail(url):
    """Return the URL of every book listed on one search-result page.

    Args:
        url: address of a search-result page.

    Returns:
        A list with the ``href`` of each ``<h3 class="title"><a>`` link; empty
        when the page has no such links or could not be parsed.

    Raises:
        requests.RequestException: when the page could not be downloaded.
    """
    tree = _fetch_tree(url)
    if tree is None:
        return []
    return tree.xpath('//h3[@class="title"]/a/@href')


def get_book_detial(book_url):
    """Return the details of one book, or ``None`` when they are unavailable.

    Args:
        book_url: address of the book's page.

    Returns:
        A dict with the keys ``'title'``, ``'author'`` and ``'information'``
        (the text of the ``novelintro`` block, ``''`` if that block is
        absent).  ``None`` is returned, after printing the reason, when the
        download fails or the page has no title/author node.
    """
    try:
        tree = _fetch_tree(book_url)
    except requests.RequestException as e:
        print(e, '请求失败')
        return None
    if tree is None:
        print(book_url, '页面为空')
        return None

    try:
        # xpath() yields an empty list when the node is missing, so [0] is the
        # only place an IndexError can come from.
        title = tree.xpath("//span[@itemprop='articleSection']//text()")[0]
        author = tree.xpath("//span[@itemprop='author']//text()")[0]
    except IndexError as e:
        print(e, '下标越界')
        return None

    information = tree.xpath("string(//div[@id='novelintro'])")
    # str() turns lxml's "smart string" results into plain strings.
    return {
        'title': str(title),
        'author': str(author),
        'information': str(information),
    }


# Correctly spelled alias; the original (misspelled) name stays available.
get_book_detail = get_book_detial


def spider(first_page=1, last_page=4, delay=REQUEST_DELAY):
    """Crawl the search-result pages and collect the details of every book.

    Args:
        first_page: number of the first result page to visit.
        last_page: number of the last result page to visit (inclusive).
        delay: seconds to wait before each book request, to avoid hitting the
            site too quickly.

    Returns:
        A list of book dicts as produced by ``get_book_detial``.  Books and
        pages that could not be fetched are skipped rather than stored as
        ``None`` or aborting the crawl.
    """
    bookshelf = []
    for page_num in range(first_page, last_page + 1):
        print("这是第{index}页的信息\n\n\n".format(index=page_num))
        url = SEARCH_URL.format(page_num=page_num)
        try:
            book_list = get_page_detail(url)
        except requests.RequestException as e:
            print(e, '请求失败')
            continue
        for book_url in book_list:
            time.sleep(delay)
            book = get_book_detial(book_url)
            if book is not None:
                bookshelf.append(book)
    return bookshelf


if __name__ == '__main__':
    print(spider())
运行时常会出现 "list index out of range 下标越界",或是 "TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败"。前者是因为 xpath 没有匹配到节点、返回了空列表,再取下标 [0] 就会越界;有的师傅说这是访问过快所致,页面没有返回预期的内容,导致 list 的值没赋上……代码有待优化。