Scraping the Maoyan Movie TOP100

This was my first time storing scraped data in MySQL, so I picked the Maoyan movie chart, which is relatively easy to scrape, but I still ran into a problem. While creating the table, the program suddenly threw an error. I looked over the SQL statement and, at first glance, could not see anything wrong with it. Here are the failing table-creation code and the error it produced:

    sql = 'create table if not exists maoyan(index varchar(50) not null,title varchar(60),actor varchar(100),score varchar(20),img varchar(255),primary key(indexs))'
    TypeError: %d format: a number is required, not str

At first this problem really drove me crazy; I could not find the cause anywhere. After some searching online I tried changing index to indexs, and the error went away. The reason turns out to be that INDEX is a reserved word in MySQL, so it cannot be used as a bare column name and the CREATE TABLE statement is rejected as a syntax error; the baffling TypeError apparently comes from an older pymysql failing while formatting the server's error message, which masks the real ProgrammingError. A backtick workaround is sketched below, and the full code follows after that.
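As an alternative to renaming the column, MySQL also accepts reserved words as identifiers when they are quoted with backticks; a minimal sketch of the same statement with the original column name kept:

    sql = ('create table if not exists maoyan('
           '`index` varchar(50) not null,title varchar(60),'
           'actor varchar(100),score varchar(20),img varchar(255),'
           'primary key(`index`))')  # backticks let the reserved word through the parser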

# !/usr/bin/env python
# encoding: utf-8

import csv
import time

import pymysql
import requests
from lxml import etree

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              'image/webp,image/apng,*/*;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'maoyan.com',
    'Referer': 'http://maoyan.com/board/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
}

def Get_HTML(url):
    # Fetch one page of the board and raise on HTTP errors.
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    #print(r.text)
    return r.text

def Parse_HTML(html, info):
    # Every <dd> under the board list is one movie entry.
    s = etree.HTML(html)
    divs = s.xpath('//*[@id="app"]/div/div/div[1]/dl/dd')
    for div in divs:
        ilt = {}

        # Ranking number; the key is "indexs" rather than "index"
        # because INDEX is a reserved word in MySQL.
        ilt['indexs'] = div.xpath('./i/text()')[0]

        ilt['title'] = div.xpath('./div/div/div[1]/p[1]/a/text()')[0]
        print(ilt['title'])

        ilt['actor'] = div.xpath('./div/div/div[1]/p[2]/text()')[0].strip()

        # The score is rendered as two <i> tags: the integer part
        # and the fractional part.
        ilt['score'] = (div.xpath('./div/div/div[2]/p/i[1]/text()')[0]
                        + div.xpath('./div/div/div[2]/p/i[2]/text()')[0])

        # The poster is lazy-loaded, so the URL lives in @data-src.
        ilt['img'] = div.xpath('./a/img[2]/@data-src')[0]

        print("\n++++++++++++++++++ divider ++++++++++++++++++\n")

        info.append(ilt)

def Write_csv(info):
    with open("maoyanmovies.csv", "w+", newline='', encoding='utf-8') as csvfile:
        try:
            # Field names must match the dict keys built in Parse_HTML.
            header = ["indexs", "title", "actor", "score", "img"]
            writer = csv.DictWriter(csvfile, fieldnames=header)
            writer.writeheader()
            for item in info:
                writer.writerow(item)
            print("CSV written successfully")
        except (csv.Error, ValueError):
            print("Failed to write CSV")

def Write_MySQL(info):
    db = pymysql.connect(host='localhost', port=3306, user='root',
                         password='forever.cl', db='spiders', charset='utf8')
    cursor = db.cursor()
    cursor.execute('select version()')
    data = cursor.fetchone()
    print('Data_version:', data)
    sql = ('create table if not exists maoyan('
           'indexs varchar(50) not null,title varchar(60),'
           'actor varchar(100),score varchar(20),img varchar(255),'
           'primary key(indexs))')
    cursor.execute(sql)
    print("Create table successful")
    for item in info:
        # Build the INSERT dynamically from the dict keys, with %s
        # placeholders so pymysql escapes the values itself.
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into maoyan({keys}) values({values})'.format(keys=keys, values=values)
        #print(sql)
        try:
            cursor.execute(sql, tuple(item.values()))
            db.commit()
            print("Insert successful")
        except pymysql.MySQLError:
            print("Insert failed, rolling back")
            db.rollback()
    db.close()

def main():
    start_url = "http://maoyan.com/board/4?offset="
    info = []
    for i in range(10):
        url = start_url + str(i * 10)
        html = Get_HTML(url)
        Parse_HTML(html, info)
        time.sleep(3)  # be polite between requests
    #Write_csv(info)
    # Write once, after all pages are parsed; calling this inside the
    # loop would re-insert the accumulated rows on every iteration and
    # trip over the primary key.
    Write_MySQL(info)

if __name__ == '__main__':
    main()
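After a run, a quick query against the same spiders database (reusing the connection settings above) makes a handy sanity check; a minimal sketch:

    import pymysql

    db = pymysql.connect(host='localhost', port=3306, user='root',
                         password='forever.cl', db='spiders', charset='utf8')
    cursor = db.cursor()
    cursor.execute('select count(*) from maoyan')
    print('rows:', cursor.fetchone()[0])  # expect 100 for the full TOP100
    # indexs is a varchar, so "+0" forces a numeric sort
    cursor.execute('select indexs,title,score from maoyan order by indexs+0 limit 3')
    for row in cursor.fetchall():
        print(row)
    db.close()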

Life is short, I use Python.

Let's keep at it together!!!
posted @ 2018-07-03 10:17  iqunqunqun