爬取猫眼电影TOP100
这次是第一次存储在MySQL中,所以选择了相对简单易爬取的猫眼电影排行,但还是出现了问题。当我在创建表的时候,整个程序突然报错了,查看了一下sql语句,发现并没有什么错误,这是我直观的感受,下面是我创建表失败的代码以及报错的问题:
sql = 'create table if not exists maoyan(index varchar(50) not null,title varchar(60),actor varchar(100),score varchar(20),img varchar(255),primary key(indexs))'
TypeError: %d format: a number is required, not str
一开始碰到这个问题真的是疯了,怎么都找不到原因。后面经过网上查找资料,尝试将 index 改为 indexs 后就成功了。原因其实是:index 是 MySQL 的保留关键字(用于索引语法),不能直接裸用作列名,所以 create table 语句会被判为语法错误;改名为 indexs,或者用反引号把列名写成 `index`,都可以避开这个冲突。最后直接展示代码。
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# !/usr/bin/env python
# encoding: utf-8
"""Scrape the Maoyan movie TOP100 board and store the results in CSV / MySQL."""

import csv
import time

import pymysql
import requests
from lxml import etree

# Browser-like headers so the site does not reject the scraper outright.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Referer': 'http://maoyan.com/board/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}


def Get_HTML(url):
    """Fetch *url* and return the decoded HTML text.

    Raises requests.HTTPError on a non-2xx response.
    """
    # timeout added so a stalled connection cannot hang the scraper forever
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def Parse_HTML(html, info):
    """Parse one board page and append one dict per movie to *info*.

    Each dict has the keys: indexs, title, actor, score, img.
    """
    s = etree.HTML(html)
    divs = s.xpath('//*[@id="app"]/div/div/div[1]/dl/dd')
    for div in divs:
        ilt = {}
        ilt['indexs'] = div.xpath('./i/text()')[0]
        ilt['title'] = div.xpath('./div/div/div[1]/p[1]/a/text()')[0]
        ilt['actor'] = div.xpath('./div/div/div[1]/p[2]/text()')[0].strip()
        # The integer and fractional parts of the score live in two <i> tags.
        ilt['score'] = (div.xpath('./div/div/div[2]/p/i[1]/text()')[0]
                        + div.xpath('./div/div/div[2]/p/i[2]/text()')[0])
        ilt['img'] = div.xpath('./a/img[2]/@data-src')[0]
        print(ilt['title'])
        print("\n++++++++++++++++++分界线++++++++++++++++++\n",)
        info.append(ilt)


def Write_csv(info):
    """Write the scraped entries to maoyanmovies.csv."""
    with open("maoyanmovies.csv", "w", newline='', encoding='utf-8') as csvfile:
        try:
            # BUGFIX: the first field must be "indexs" to match the dict keys
            # produced by Parse_HTML; with "index" DictWriter.writerow raises
            # ValueError ("dict contains fields not in fieldnames").
            header = ["indexs", "title", "actor", "score", "img"]
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            writer.writerows(info)
            print("You're success!")
        except (csv.Error, ValueError) as e:
            # narrowed from a bare except: so programming errors still surface
            print("Oh no!false", e)


def Write_MySQL(info):
    """Create the maoyan table if missing and insert every scraped entry."""
    db = pymysql.connect(host='localhost', port=3306, user='root',
                         password='forever.cl', db='spiders', charset='utf8')
    try:
        cursor = db.cursor()
        cursor.execute('select version()')
        data = cursor.fetchone()
        print('Data_version:', data)
        # NOTE: the column is named `indexs` because `index` is a reserved
        # word in MySQL; using it unquoted made the original CREATE TABLE fail.
        sql = ('create table if not exists maoyan('
               'indexs varchar(50) not null,'
               'title varchar(60),'
               'actor varchar(100),'
               'score varchar(20),'
               'img varchar(255),'
               'primary key(indexs))')
        cursor.execute(sql)
        print("Create table successful")
        for item in info:
            keys = ','.join(item.keys())
            values = ','.join(['%s'] * len(item))
            # Only column names are interpolated; the values go through
            # pymysql's parameterized %s placeholders (no SQL injection).
            sql = 'insert into maoyan({keys}) values({values})'.format(
                keys=keys, values=values)
            print(sql)
            try:
                cursor.execute(sql, tuple(item.values()))
                print("Successful")
                db.commit()
            except pymysql.MySQLError as e:
                # narrowed from a bare except: report why the row failed
                print("NO!False", e)
                db.rollback()
    finally:
        # BUGFIX: close the connection even when table creation or an
        # insert raises; the original leaked it on any exception.
        db.close()


def main():
    """Scrape all 10 board pages (100 movies), then persist the results."""
    try:
        start_url = "http://maoyan.com/board/4?offset="
        info = []
        for i in range(10):
            url = start_url + str(i * 10)
            html = Get_HTML(url)
            Parse_HTML(html, info)
            time.sleep(3)  # be polite to the server between page fetches
        # Write_csv(info)
        Write_MySQL(info)
    except Exception as e:
        # narrowed from a bare except: (which even swallowed KeyboardInterrupt)
        print("False", e)


if __name__ == '__main__':
    main()
人生苦短,我用Python。
一起加油!!!