First small crawler project (2019.6.1)

Target URL: http://xuexiao.51sxue.com/slist/?o=&t=3&areaCodeS=&level=&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey=

Scrape the data from the junior high and senior high school listing pages.

Fields to capture: school name, region, and school introduction.

The scraped region is then matched against the mjp_district_area district table: if the region is not in the table, 0 is stored; otherwise the matching district id is stored along with the record in the school table. A small sketch of that lookup step comes first, followed by the full code.
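In isolation, that lookup-or-zero step might look like the small sketch below. It is only an illustration: it uses pymysql parameter binding instead of the string formatting in the full script, and the table and column names are taken from that script.

def lookup_da_id(cursor, district_name):
    # return the district id for a name, or 0 when the name is not in mjp_district_area
    cursor.execute("SELECT da_id FROM mjp_district_area WHERE da_name=%s",
                   (district_name.strip(),))
    row = cursor.fetchone()  # a one-element tuple, or None
    return row[0] if row else 0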

import requests
from lxml import etree
import time
import random
import pymysql

# connect to the MySQL database that holds the district and school tables
db = pymysql.connect(host='192.168.1.200', port=3306, user='app_mjp', password='app_mjp', db='app_mjp')

cursor = db.cursor()


def get_url():
    lst1 = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    ]
    numbo=0
    for num in range(1,1334):
        numbo+=1
        url = "http://xuexiao.51sxue.com/slist/?o=&t=2&areaCodeS=&level=&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey="+"&page="+str(num)
        response = requests.get(url=url,headers={"User-Agent":random.choice(lst1)}).content
        etee = etree.HTML(response)
        title = etee.xpath('//div[@class="school_m_main fl"]/li/h3//text()')  # school names
        new_url = etee.xpath('//div[@class="school_m_main fl"]/li/h3//@href')  # detail-page links
        diqu = etee.xpath('//div[@class="school_m_main fl"]/li[2]/b//text()')  # region strings
        xuexiaojieshao = get_jianjie(new_url)  # school introductions

        # all data for this page collected; pair up title, district and introduction
        data = zip(title, diqu, xuexiaojieshao)

        # match each district name against the mjp_district_area table
        for i in data:
            # the district name is in the last few characters of the location string;
            # strip() removes the leading space that otherwise breaks the match
            sql = 'select da_id from mjp_district_area where da_name="{}"'.format(i[1][-4:].strip())
            cursor.execute(sql)
            db.commit()
            rs = cursor.fetchone()  # a one-element tuple, or None when the district is unknown
            da_id = rs[0] if rs else 0  # matched district id, or 0 when there is no match
            print(i[0], da_id, i[2], 2)
            sql_insert = "INSERT INTO mjp_school(s_name,s_da_id,s_type,s_desc,s_creat_time) VALUES(%s,%s,%s,%s,%s)"
            try:
                # insert the school record; s_type is hard-coded to 2
                cursor.execute(sql_insert, (i[0], da_id, 2, i[2], time.time()))
                db.commit()
            except Exception as ss:
                print(ss)
                # roll back on error so the connection stays usable
                db.rollback()
        print("Page %s finished" % numbo)

def get_jianjie(new_url):
    """Fetch the introduction text from each school's content page."""
    lst = []
    lst1 = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    ]
    for u in new_url:
        # the introduction lives on the "content" page rather than the "detail" page
        url = u.replace("detail", "content")
        res = requests.get(url=url, headers={"User-Agent": random.choice(lst1)})
        res.encoding = "gbk"  # the site serves GBK-encoded pages
        tree = etree.HTML(res.text)
        # join the paragraph text, stripping full-width and non-breaking spaces
        p = "".join(
            [i.replace("\u3000\u3000", "").strip() for i in tree.xpath('//*[@id="nr_main"]/div[1]/div[2]/p//text()')]).replace("\xa0 \xa0 \xa0", "")
        lst.append(p)
    return lst


if __name__ == '__main__':
    get_url()
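
One note on coverage: get_url() hard-codes t=2 in the listing URL, while the URL quoted at the top of this post has t=3. Assuming those two values select the junior-high and senior-high listings (an assumption based only on the two URLs in this post, not on site documentation), both types could be crawled by looping over the t parameter, roughly like this:

# Sketch only: treating t=2 and t=3 as the two school types is an assumption.
BASE = ("http://xuexiao.51sxue.com/slist/?o=&t={t}&areaCodeS=&level=&sp="
        "&score=&order=&areaS=%C8%AB%B9%FA&searchKey=&page={page}")

def listing_urls(pages=1333):
    # yield (school_type, url) pairs for every page of both listings
    for t in (2, 3):
        for page in range(1, pages + 1):
            yield t, BASE.format(t=t, page=page)

get_url() could then iterate over these pairs instead of building the URL inline, and possibly record t in the s_type column in place of the hard-coded 2, if that column does encode the school type.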

Takeaways: a very simple little project, yet it still took a whole day.

Specific difficulties: 1. Two tables had to be joined, and I worried that would go wrong.

      2. The region scraped from the page had a leading space, so the district lookup returned None until the value was stripped.

      3. I have not used multithreading yet; more practice with it is next (a rough sketch follows below).
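
For difficulty 3, here is a rough sketch of fetching the introduction pages with a thread pool instead of one request at a time. It is not part of the script above: get_intro_text() just repeats the per-URL body of get_jianjie(), and the single hard-coded User-Agent stands in for the random choice from lst1.

from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree

UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36"

def get_intro_text(u):
    # same per-URL steps as get_jianjie(): switch to the "content" page, decode GBK, join the paragraphs
    res = requests.get(u.replace("detail", "content"), headers={"User-Agent": UA})
    res.encoding = "gbk"
    tree = etree.HTML(res.text)
    return "".join(t.strip() for t in tree.xpath('//*[@id="nr_main"]/div[1]/div[2]/p//text()'))

def get_jianjie_threaded(new_url, workers=8):
    # the requests are I/O-bound, so a thread pool overlaps them;
    # map() keeps the results in the same order as new_url
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(get_intro_text, new_url))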

 

posted @ 2019-06-03 16:33  山东张铭恩