一些爬虫代码

基于xpath的爬虫

爬取起点中文网的热门书籍名称、作者、月票数以及简介,并将结果保存在 xiaoshuo.txt 中。

import requests
from lxml import etree
import time
import sys		#以下三行是为了解决编码报错问题
reload(sys)
sys.setdefaultencoding("utf8")

def _first_text(nodes, default=""):
    """Return the first XPath match in *nodes*, or *default* if nothing matched."""
    return nodes[0] if nodes else default


# Crawl the qidian.com "all books" listing, follow every book link, and write
# title / author / monthly-ticket count plus the introduction to xiaoshuo.txt.

# The request headers never change, so build them once instead of per page.
header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}

# `with` guarantees the output file is flushed and closed even when a request
# or an XPath lookup raises part-way through the crawl.
with open("xiaoshuo.txt", "w") as fo:
    # NOTE(review): this requests page=0..4, exactly as before. If the site
    # numbers pages from 1, page 0 may duplicate page 1 - confirm and switch
    # to range(1, 6) if so.
    for i in range(5):
        url = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=%d" % i
        # A timeout keeps one stalled connection from hanging the whole crawl.
        data = requests.get(url, headers=header, timeout=10).text
        f = etree.HTML(data)

        # Links to the individual book pages (protocol-relative, "//host/...").
        hrefs = f.xpath('/html/body/div[1]/div[5]/div[2]/div[2]/div/ul/li/div[2]/h4/a/@href')
        for href in hrefs:
            href = "https:" + href
            book = requests.get(href, headers=header, timeout=10).text
            e = etree.HTML(book)

            # A missing node yields an empty field instead of an IndexError,
            # so one oddly laid-out book page does not abort the run.
            title = _first_text(e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/em/text()'))
            zuozhe = _first_text(e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/span/a/text()'))  # author
            jieshao = e.xpath('/html/body/div/div[6]/div[4]/div[1]/div[1]/div[1]/p/text()')  # intro text nodes
            yuepiao = _first_text(e.xpath('//*[@id="monthCount"]/text()'))  # monthly tickets

            # Header line for the book, followed by its introduction text.
            record = '<----->' + title + '<----->' + zuozhe + '<----->' + yuepiao + '\n'
            fo.write(record)
            fo.writelines(jieshao)

基于selenium的爬虫

目的是爬取校园网上的个人基本信息,尚未完成。最终目标是实现批量查询(学号和密码有固定格式)。

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time	

# Log in to the HDU CAS page with Selenium, open the first quick-home link and
# dump the resulting page source.
#
# Author's note: find_element_by_* could never locate the button that has to
# be clicked, so the script could not reach the next page; the planned next
# step was to try combining Selenium with the requests library.

driver = webdriver.Chrome()
try:
    # NOTE(review): elements that cannot be located right after a login
    # redirect are often simply not rendered yet. An implicit wait makes every
    # find_element_* call poll for up to 10 seconds - confirm this resolves
    # the problem described above (an iframe would need a different fix).
    driver.implicitly_wait(10)
    driver.get("http://cas.hdu.edu.cn/cas/login?service=http%3A%2F%2Fonce.hdu.edu.cn%2Fdcp%2Findex.jsp")
    elem1 = driver.find_element_by_id("un")
    elem2 = driver.find_element_by_id("pd")
    # Replace the two placeholders with a real student ID and password.
    elem1.send_keys("学号")
    elem2.send_keys("密码")
    driver.find_element_by_id('index_login_btn').click()
    driver.find_element_by_class_name('quickhome_item_link').click()
    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print(driver.page_source)
finally:
    # Always shut the browser down; otherwise every run (and every failure)
    # leaves a Chrome / chromedriver process behind.
    driver.quit()

基于正则表达式

贴吧图片批量下载(下面的示例目前只抓取并打印页面 HTML,尚未调用 getimg / downimg)。

import urllib
import re

def gethtml(url):
    """Fetch *url* and return the raw response body.

    The body is returned exactly as read from the response: a byte string
    (``str`` on Python 2, ``bytes`` on Python 3). No decoding is attempted.
    """
    # The file was written for Python 2 (urllib.urlopen); fall back to it when
    # the Python 3 location is unavailable so the function works on both.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even if read() raises.
        page.close()

# Matches src="....jpg" immediately followed by a ` size` attribute. The URL
# part excludes double quotes so a match can never run past the end of one
# attribute value into a later tag.
_IMG_SRC_RE = re.compile(r'src="([^"]+?\.jpg)" size')


def getimg(html):
    """Return every ``.jpg`` URL found in a ``src="..." size`` attribute pair.

    html: page markup as text.
    Returns a list of the captured URL strings, in document order (empty when
    nothing matches).
    """
    return _IMG_SRC_RE.findall(html)

def downimg(imglist, local='D:/VScode/image/'):
    """Download every URL in *imglist* into the directory *local*.

    Files are named by position: ``0.jpg``, ``1.jpg``, ...

    imglist: iterable of image URLs.
    local: target directory; defaults to the path the script originally
        hard-coded. It is created when it does not exist yet.
    """
    import os

    # Work with both the Python 2 location the file was written against and
    # the Python 3 one.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    # urlretrieve cannot create missing directories itself.
    if not os.path.isdir(local):
        os.makedirs(local)

    for x, img in enumerate(imglist):
        # os.path.join copes with *local* given with or without a trailing slash.
        urlretrieve(img, os.path.join(local, '%s.jpg' % x))
            
# Fetch the sample page and dump its raw HTML to stdout.
html = gethtml("https://movie.douban.com/subject/26942674/")
# Parenthesised form: same output on Python 2, and also valid on Python 3.
print(html)

posted @ 2020-07-31 22:16  iloveacm  阅读(3134)  评论(0)  编辑  收藏  举报