# 爬虫练习 — crawler practice scripts (each section below is an independent script)
# 豆瓣电影 (Douban movies) — regex-based crawler
# Douban Top250 scraper (regex-based).
# Fetches the Top250 list page, extracts name / year / score / vote count
# for each film with a compiled regex, and writes the rows to data.csv.
import requests, re, csv

url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url, headers=headers)
page_content = resp.text
resp.close()  # release the HTTP connection once the body is read

# 解析数据 — one named-group match per movie entry.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?) .*?<span '
                 r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<num>.*?)人评价</span>', re.S)

# FIX: open the CSV with newline="" (prevents blank rows on Windows — the csv
# module requires it) and an explicit encoding; the with-block guarantees the
# file is closed even if matching/writing raises.
with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    for it in obj.finditer(page_content):
        dic = it.groupdict()
        dic['year'] = dic['year'].strip()  # strip去空格 — drop leading whitespace
        csvwriter.writerow(dic.values())

print("over!")
# 电影天堂 (dytt8 movie site) — regex-based crawler
# dytt8 scraper: grab the "最新影片推荐" (latest recommendations) list from the
# home page, follow each movie's detail page, and print title + download link.
import requests, re

url = "https://www.dytt8.net/"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
# NOTE(review): verify=False disables TLS certificate validation — acceptable
# for this practice script only because the site's certificate is broken.
resp = requests.get(url, headers=headers, verify=False)
resp.encoding = 'gb2312'  # 指定字符集 — the site serves GB-encoded pages

# obj1 isolates the <ul> under the recommendations heading; obj2 pulls each
# child link out of that <ul>; obj3 extracts title + download URL from a
# detail page.
obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片 名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)

# FIX: renamed the accumulator from "list" — it shadowed the builtin.
child_urls = []
for it in obj1.finditer(resp.text):
    ul = it.group('ul')
    for itt in obj2.finditer(ul):
        # Detail-page link: join the relative href onto the site root.
        child_urls.append(url + itt.group('href').strip("/"))

for href in child_urls:
    detail = requests.get(href, headers=headers, verify=False)
    detail.encoding = 'gb2312'  # 指定字符集
    result3 = obj3.search(detail.text)
    # FIX: guard against detail pages that don't match the pattern instead of
    # crashing with AttributeError on None.
    if result3 is not None:
        print(result3.group("movie"))
        print(result3.group("download"))
    break  # sample only the first detail page, as in the original
# bs4爬虫 — scrape the daily vegetable-price table from the Tongzhou district
# government site and write name / category / high / average to 菜价.csv.
import requests, csv
from bs4 import BeautifulSoup

url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'  # 指定字符集

# 1. Hand the page source to BeautifulSoup with the stdlib html parser.
page = BeautifulSoup(resp.text, "html.parser")
# 2. Locate the price table container.  Using attrs={"class": ...} avoids the
#    class_ keyword workaround (class is a Python keyword).
div = page.find("div", attrs={"class": "m-r-main m-textLists"})
trs = div.find_all("tr")[1:]  # [1:] slices off the header row

# FIX: open the CSV with newline="" (csv-module requirement; prevents blank
# rows on Windows) and an explicit encoding; with-block closes the file even
# if parsing raises.
with open("菜价.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    for tr in trs:              # one <tr> per produce item
        tds = tr.find_all("td")
        name = tds[0].text      # 品名
        category = tds[1].text  # 分类
        high = tds[2].text      # 最高价
        avg = tds[3].text       # 平均价
        csvwriter.writerow([name, category, high, avg])

print("over!")
# 彼岸壁纸 (netbian) wallpaper downloader: walk the 4K list page, open each
# picture's detail page, and save the preview image into the img/ folder.
import os, time
import requests
from bs4 import BeautifulSoup

url = "https://pic.netbian.com/4kmeinv/"
url1 = "https://pic.netbian.com"  # site root, used to absolutize relative links
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}

# FIX: create the target folder automatically instead of requiring the user
# to pre-create img/ (the old comment told them to).
os.makedirs("img", exist_ok=True)

resp = requests.get(url, headers=headers)
resp.encoding = 'gbk'  # 指定字符集

page = BeautifulSoup(resp.text, "html.parser")
anchors = page.find("div", class_="slist").find_all("a")  # one <a> per picture

for a in anchors:
    href = url1 + a.get('href')
    # Fetch the picture's detail page.
    resp2 = requests.get(href, headers=headers)
    resp2.encoding = 'gbk'  # 指定字符集
    page2 = BeautifulSoup(resp2.text, "html.parser")
    img = page2.find("div", class_="photo-pic").find("img")
    src = url1 + img.get("src")

    # Download the image bytes.  img_name is the last path segment, e.g.
    # .../210831/102129-163037648996ad.jpg -> 102129-163037648996ad.jpg
    img_resp = requests.get(src)
    img_name = src.split("/")[-1]
    # FIX: removed the redundant f.close() that sat inside the with-block.
    with open("img/" + img_name, mode="wb") as f:
        f.write(img_resp.content)  # 图片内容写入文件
    print(img_name + " is Download OK")
    time.sleep(0.5)  # throttle requests — be polite to the server

print("OVER")
# 线程池+xpath提取 — fetch a range of maicainan.com detail pages concurrently
# and append each page's price-table rows to 1.csv.
import requests, csv, threading
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}

# FIX: newline="" so the csv module controls line endings (no blank rows on
# Windows).
f = open("1.csv", mode="w", encoding="UTF-8", newline="")
csvwriter = csv.writer(f)
# FIX: csv.writer is not thread-safe — serialize writerow() calls so rows
# written by different worker threads cannot interleave/corrupt each other.
write_lock = threading.Lock()


def page1(url):
    """Fetch one detail page and append its table rows to the shared CSV."""
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
    for tr in table.xpath("./tr"):  # 截取tr — one row at a time
        # Drop the non-breaking spaces the site pads cells with.
        txt = [item.replace("\xa0", "") for item in tr.xpath("./td/text()")]
        with write_lock:  # 存放数据 — one row per writerow, under the lock
            csvwriter.writerow(txt)
    print(url + "提取完成")


if __name__ == '__main__':
    # 50 worker threads, 88 tasks (page ids 4611–4698).
    # (FIX: the old comments claimed "500个线程" and "200个任务" — neither
    # matched the code.)
    with ThreadPoolExecutor(50) as t:
        for i in range(11, 99):
            # 任务提交到线程池 — submit one fetch task per page id.
            t.submit(page1, f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
    # The with-block above blocks until every task finishes.
    # FIX: close (and thereby flush) the CSV before announcing completion —
    # the original never closed it, risking lost buffered rows.
    f.close()
    print("全部提取完成")