第一个爬虫小项目 2019.6.1
爬取网址为:http://xuexiao.51sxue.com/slist/?o=&t=3&areaCodeS=&level=&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey=
爬取初中跟高中页面的数据
具体字段为 学校名字 地区及学校介绍
然后拿着地区与 mjp_district_area 表作比较:地区如果表里没有则设置为 0,有的话把该地区对应的 id 存在 school 表里,代码如下
"""Crawler for school listings on xuexiao.51sxue.com.

For every listing page it extracts each school's name, area and
description, resolves the area name against the mjp_district_area
table, and inserts the school into mjp_school (s_da_id = 0 when the
area is unknown).
"""
import random
import time

import pymysql
import requests
from lxml import etree

# One shared pool of User-Agent strings (previously duplicated in both
# functions); a random one is sent per request so the traffic looks
# less uniform.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Mozilla/5.0?(Windows?NT?10.0;?Win64;?x64)?AppleWebKit/537.36?(KHTML,?like?Gecko)?Chrome/73.0.3683.103?Safari/537.36 ',
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
]

db = pymysql.connect(host='192.168.1.200', port=3306, user='app_mjp',
                     password='app_mjp', db='app_mjp')
cursor = db.cursor()


def get_url(school_type=2, max_page=1333):
    """Crawl every listing page and persist each school to mjp_school.

    school_type: value of the site's "t" query parameter, also stored
        in the s_type column.  Default 2 keeps the original behavior;
        the write-up says 3 selects the high-school listing — TODO
        confirm against the site.
    max_page: number of listing pages to fetch (pages 1..max_page;
        the original hard-coded 1333).
    """
    for page in range(1, max_page + 1):
        url = ("http://xuexiao.51sxue.com/slist/?o=&t={}&areaCodeS=&level="
               "&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey="
               "&page={}".format(school_type, page))
        html = requests.get(
            url, headers={"User-Agent": random.choice(USER_AGENTS)}).content
        tree = etree.HTML(html)
        titles = tree.xpath('//div[@class="school_m_main fl"]/li/h3//text()')
        detail_urls = tree.xpath('//div[@class="school_m_main fl"]/li/h3//@href')
        areas = tree.xpath('//div[@class="school_m_main fl"]/li[2]/b//text()')
        descs = get_jianjie(detail_urls)

        for title, area, desc in zip(titles, areas, descs):
            # The scraped area string ends with the district name; the
            # last 4 chars (stripped of padding spaces — see note 2 in
            # the write-up) are matched against mjp_district_area.
            # Parameterized query: scraped text is untrusted, never
            # format it into SQL.
            cursor.execute(
                'SELECT da_id FROM mjp_district_area WHERE da_name=%s',
                (area[-4:].strip(),))
            row = cursor.fetchone()
            # fetchone() yields a 1-tuple; unwrap it.  The original
            # passed the tuple itself to the INSERT, so every school
            # with a matched area raised and was rolled back.
            da_id = row[0] if row else 0
            try:
                cursor.execute(
                    "INSERT INTO mjp_school"
                    "(s_name,s_da_id,s_type,s_desc,s_creat_time)"
                    " VALUES(%s,%s,%s,%s,%s)",
                    (title, da_id, school_type, desc, time.time()))
                db.commit()
            except Exception as exc:
                print(exc)
                # Roll back the failed insert so later rows can proceed.
                db.rollback()
        print("第%s爬取完成" % page)


def get_jianjie(new_url):
    """Fetch and clean the description text for each detail URL.

    new_url: iterable of school detail-page URLs; the "detail" path
        segment is swapped for "content" to reach the description page.
    Returns a list of description strings, one per URL, in input order.
    """
    descriptions = []
    for link in new_url:
        res = requests.get(link.replace("detail", "content"),
                           headers={"User-Agent": random.choice(USER_AGENTS)})
        res.encoding = "gbk"  # the site serves GBK-encoded pages
        tree = etree.HTML(res.text)
        paragraphs = tree.xpath('//*[@id="nr_main"]/div[1]/div[2]/p//text()')
        # Strip ideographic-space indents, then drop the NBSP run the
        # site uses as a separator.
        text = "".join(
            p.replace("\u3000\u3000", "").strip() for p in paragraphs)
        descriptions.append(text.replace("\xa0 \xa0 \xa0", ""))
    return descriptions


if __name__ == '__main__':
    try:
        get_url()
    finally:
        db.close()  # release the MySQL connection even on failure
心得: 很简单的一个小项目 搞了一天
具体困难: 1 需要连接两张表 担心会出问题
2 取出来的地区前面带了个空格,导致匹配地区拿到的是 None
3 没用过多线程,后续加强练习多线程