爬取崔庆才大神的爬虫教程，最后存储到 MySQL
# -*- coding: utf-8 -*-
"""Scrape tutorial links from a cuiqingcai.com page and store them in MySQL.

Importing this module performs the HTTP request and parses the page into the
module-level lists ``r`` (link texts) and ``t`` (link targets).  Running it as
a script additionally inserts the scraped rows into the ``xxb`` table.
"""
import time
import unittest

import pymysql
import requests
from lxml import etree

url = 'http://cuiqingcai.com/1052.html'

head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
DB_CONFIG = {
    'host': '192.168.191.1',
    'user': 'root',
    'passwd': '123456789',
    'db': 'data',
    'port': 3306,
    'charset': 'utf8',
}

# Placeholders are bound by the driver, so scraped text is escaped correctly.
INSERT_SQL = (
    'INSERT INTO xxb(id,name,remark,link,time) VALUES (%s,%s,%s,%s,%s)'
)

# Fixed value written to the ``remark`` column of every row.
REMARK = 'ggg'

# Send the browser-like headers defined above (they were previously unused)
# and bound the wait so an unresponsive server cannot hang the script.
html = requests.get(url, headers=head, timeout=10)

selector = etree.HTML(html.text)

# Link texts and link targets of the anchors inside the article paragraphs.
# NOTE(review): the two lists are paired by position; an anchor without a
# text node would shift the pairing -- verify against the page markup.
r = selector.xpath("/html/body/section/div[3]/div/article/p/a/text()")
t = selector.xpath("/html/body/section/div[3]/div/article/p/a/@href")

# Timestamp (local time, minute resolution) stored with every inserted row.
now = time.strftime('%Y-%m-%d-%H-%M')


class datas(object):
    """Turn the scraped link lists into rows and persist them to MySQL."""

    def Data_storage(self, dt):
        """Insert every ``(id, name, link)`` triple of *dt* into ``xxb``.

        All rows are written over a single connection and committed as one
        transaction; the connection is closed even if the insert fails.
        Nothing is done (no connection is opened) when *dt* is empty.

        After the call ``self.uid``, ``self.uname`` and ``self.ulink`` hold
        the values of the last row processed.

        Args:
            dt: iterable of 3-item sequences ``(id, name, link)``.
        """
        rows = []
        for uid, uname, ulink in dt:
            # Unpack the triple directly: joining and re-splitting on ","
            # corrupted any title that itself contained a comma.
            self.uid = str(uid)
            self.uname = str(uname)
            self.ulink = str(ulink)
            rows.append((self.uid, self.uname, REMARK, self.ulink, now))

        if not rows:
            return

        conn = pymysql.connect(**DB_CONFIG)
        try:
            with conn.cursor() as cur:
                cur.executemany(INSERT_SQL, rows)
            conn.commit()
        finally:
            conn.close()

    def Data_processing(self, limit=30):
        """Number the scraped links from 1 and store them.

        Pairs the module-level ``r`` and ``t`` lists positionally, keeps at
        most *limit* pairs, and passes ``(str(index), name, link)`` triples
        to :meth:`Data_storage`.

        Args:
            limit: maximum number of links to store; ``None`` stores all.
        """
        pairs = list(zip(r, t))
        if limit is not None:
            pairs = pairs[:limit]
        dt = [
            (str(index), name, link)
            for index, (name, link) in enumerate(pairs, start=1)
        ]
        self.Data_storage(dt)


if __name__ == "__main__":
    gg = datas()
    gg.Data_processing()