Python Crawlers in Practice
一、Scraping videos from Qiushibaike
First find the target page and work out where the videos come from; a regular expression matches out the video links, then BeautifulSoup picks up each video's title, and the files are downloaded.
import os
import re

import requests
from bs4 import BeautifulSoup

url = "https://www.qiushibaike.com/video/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}

resp = requests.get(url, headers=headers)
text = resp.content.decode('utf-8')
soup = BeautifulSoup(text, 'html.parser')

# Match every mp4 source link; the non-greedy group keeps one match from spanning two tags
src_list = re.findall(r'<source src="(.*?)" type=\'video/mp4\' />', text)

# Use BeautifulSoup to pull out the title of each video
divs = soup.find_all('div', class_="content")
titles = [item.find('span').text.strip() for item in divs]

# The links on the page have no scheme, so prepend https:
video_urls = ["https:" + item for item in src_list]

os.makedirs("video", exist_ok=True)
for count, item in enumerate(video_urls):
    resp = requests.get(item, headers=headers)
    with open("video/" + titles[count] + ".mp4", "wb") as file:
        file.write(resp.content)
    print("已下载完第" + str(count) + "个")
print("视频下载完毕")
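The script above reads each whole video into memory before writing it out. For longer videos a streamed download is easier on memory; here is a minimal sketch (download_stream is just an illustrative helper, and the 1 MB chunk size is an arbitrary choice):

import requests

# Sketch: stream a large video straight to disk instead of holding it all in memory.
# download_stream is an illustrative helper; the chunk size is an arbitrary assumption.
def download_stream(video_url, out_path, headers):
    with requests.get(video_url, headers=headers, stream=True) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)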
二、Lianjia second-hand housing data
This one mainly uses BeautifulSoup together with a MySQL database.
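The scraper writes into a table called house, whose definition isn't shown here; the snippet below is only a guess at a workable schema (the column names follow the INSERT statement, while the types and lengths are assumptions):

import mysql.connector

# Sketch only: create the `house` table the scraper inserts into.
# Column names mirror the INSERT statement below; types and lengths are assumptions.
mydb = mysql.connector.connect(host='localhost', user='root',
                               password='fengge666', database='hotel')
cursor = mydb.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS house (
        title         VARCHAR(255),
        houseInfo     VARCHAR(255),
        data          VARCHAR(64),
        money         VARCHAR(64),
        flood         VARCHAR(255),
        price         VARCHAR(64),
        current_money VARCHAR(64),
        current_data  VARCHAR(64),
        agent         VARCHAR(64)
    )
""")
mydb.commit()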
import requests
import mysql.connector
from bs4 import BeautifulSoup


class LianJia():
    # Database connection shared by all instances
    mydb = mysql.connector.connect(host='localhost', user='root', password='fengge666', database='hotel')
    mycursor = mydb.cursor()

    # Initialisation
    def __init__(self):
        self.url = "https://bj.lianjia.com/chengjiao/pg{0}/"
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

    def send_request(self, url):
        resp = requests.get(url, headers=self.headers)
        if resp:
            self.parse_content(resp)

    def parse_content(self, resp):
        lst = []
        html = resp.text
        bs = BeautifulSoup(html, 'html.parser')
        ul = bs.find('ul', class_='listContent')
        li_lst = ul.find_all('li')
        for item in li_lst:
            title = item.find('div', class_='title').text
            houseInfo = item.find('div', class_='houseInfo').text
            data = item.find('div', class_='dealDate').text
            money = item.find('div', class_='totalPrice').text
            flood = item.find('div', class_='positionInfo').text
            price = item.find('div', class_='unitPrice').text
            span = item.find('span', class_='dealCycleTxt')
            span_lst = span.find_all('span')
            agent = item.find('a', class_='agent_name').text
            lst.append((title, houseInfo, data, money, flood, price,
                        span_lst[0].text, span_lst[1].text, agent))
        # Store this page's records in the database
        self.write_mysql(lst)

    def write_mysql(self, lst):
        sql = ("insert into house (title,houseInfo,data,money,flood,price,"
               "current_money,current_data,agent) values (%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        self.mycursor.executemany(sql, lst)
        self.mydb.commit()

    def start(self):
        # Crawl the first 10 result pages; parse_content is called inside send_request
        for i in range(1, 11):
            full_url = self.url.format(i)
            self.send_request(full_url)


if __name__ == '__main__':
    lianjia = LianJia()
    lianjia.start()
三、Scraping job listings from Jobui
BeautifulSoup is used to parse the pages and pull out the relevant data, which is saved to an xlsx file; the crawl walks through the listings page by page.
import openpyxl
from bs4 import BeautifulSoup
import requests
import time


def send_request(id, page):
    url = 'https://www.jobui.com/company/{0}/jobs/p{1}'.format(id, page)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    return resp.text


lst = []


def parse_html(html):
    bs = BeautifulSoup(html, 'html.parser')
    job_lst = bs.find_all('div', class_='c-job-list')
    for item in job_lst:
        name = item.find('h3').text
        div_tag = item.find('div', class_='job-desc')
        span_tag = div_tag.find_all('span')
        # The detail link is relative, so prepend the site root
        url = item.find('a', class_='job-name')['href']
        url = 'https://www.jobui.com' + url
        lst.append([name, span_tag[0].text, span_tag[1].text, url])


def save(lst):
    wk = openpyxl.Workbook()
    sheet = wk.active
    for item in lst:
        sheet.append(item)
    wk.save('招聘信息.xlsx')


def start(id, pages):
    for page in range(1, pages + 1):
        resp_data = send_request(id, page)
        parse_html(resp_data)
        time.sleep(2)  # be polite between page requests
    save(lst)


if __name__ == '__main__':
    id = '10375749'
    pages = 3
    start(id, pages)
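The saved workbook has no header row; if you want one, a small variant of save() adds it first (the column labels are placeholders of my own, since the two span fields are not named on the page):

import openpyxl

# Sketch: same as save(), but with a header row written first.
# The header labels are placeholders, not taken from the site.
def save_with_header(rows):
    wk = openpyxl.Workbook()
    sheet = wk.active
    sheet.append(['职位名称', '字段1', '字段2', '详情链接'])  # placeholder labels
    for row in rows:
        sheet.append(row)
    wk.save('招聘信息.xlsx')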
四、Scraping the QQ Music chart
The chart information is wrapped up in JSON, so we need to analyse the JSON structure, pull out the data we want, and save it to the database.
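A quick way to locate the path to the song list (detail → data → data → song in this response) is to pretty-print the payload first; a minimal sketch, reusing get_request() from the script that follows:

import json

# Sketch: dump the JSON with indentation to find the fields we care about.
# get_request() is the function defined in the script below.
data_json = get_request()
print(json.dumps(data_json, ensure_ascii=False, indent=2)[:2000])  # the first part is enough
# The chart entries turn out to live at data_json['detail']['data']['data']['song'].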
import requests
import mysql.connector


def get_request():
    # Chart API URL copied from the browser's network panel;
    # the sign/period parameters in it may need refreshing over time.
    url = "https://u.y.qq.com/cgi-bin/musics.fcg?-=getUCGI9057052357882678&g_tk=130572444&sign=zzan8er9xsqr1dg0y3e7df30d14b15a2b335cedcd0d6c6c883f&loginUin=1751520702&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22detail%22%3A%7B%22module%22%3A%22musicToplist.ToplistInfoServer%22%2C%22method%22%3A%22GetDetail%22%2C%22param%22%3A%7B%22topId%22%3A4%2C%22offset%22%3A0%2C%22num%22%3A20%2C%22period%22%3A%222020-07-23%22%7D%7D%2C%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%7D"
    resp = requests.get(url)
    return resp.json()


def parse_data():
    data = []
    data_json = get_request()
    # The ranking list sits at detail -> data -> data -> song
    lst_song = data_json['detail']['data']['data']['song']
    for item in lst_song:
        data.append((item['rank'], item['title'], item['singerName']))
    return data


def save():
    mydb = mysql.connector.connect(host='localhost', user='root', password='fengge666', database='python_database')
    mycursor = mydb.cursor()
    sql = 'insert into song_table values(%s,%s,%s)'
    lst = parse_data()
    mycursor.executemany(sql, lst)
    mydb.commit()
    print(mycursor.rowcount, '记录插入成功')


if __name__ == '__main__':
    save()
五、Scraping 12306 train information
This one also parses JSON data. The only slightly tricky part is the mapping between each city's name and its abbreviated station code: once you find the data packet that pairs cities with their codes and parse it, you get the dictionary you need.
Then the query results are parsed and the relevant fields are written to an xlsx file.
The finished program works like this: you type in the departure station, the destination and the travel date, and it automatically looks up the matching trains and their details.
import requests
import re
import openpyxl


def send_request(begin, end, date):
    # Translate the Chinese station names typed by the user into 12306's station codes
    lst = getHcity()
    begin = lst[begin]
    end = lst[end]
    url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={0}&leftTicketDTO.from_station={1}&leftTicketDTO.to_station={2}&purpose_codes=ADULT'.format(date, begin, end)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
               'Cookie': '_uab_collina=159514803739447239524678; JSESSIONID=C6B40F3602421EA3F1C176AAAD6D07CD; tk=t4DWOVEUjJCURbdg5YymB_o_T5iBATU7sT2cagOk978qrx1x0; RAIL_EXPIRATION=1595441480862; RAIL_DEVICEID=dCwOxY9htFoUbg-W-ZiiJivIJneE0K0MYpVRFCEGJZVXr2VWjywrVdOvEJ6HKFapkeqFwD82pjGtJv0fB1SeILpr-60WLkdvjz6zV-hcnclaYrz1AcbOskdjaz3e3fJd007cLRkk4OiauQxiu6zjnhVnI4fytM01; BIGipServerpool_passport=283968010.50215.0000; route=6f50b51faa11b987e576cdb301e545c4; _jc_save_fromStation=%u90AF%u90F8%2CHDP; _jc_save_toStation=%u77F3%u5BB6%u5E84%2CSJP; _jc_save_fromDate=2020-07-19; _jc_save_toDate=2020-07-19; _jc_save_wfdc_flag=dc; BIGipServerotn=3990290698.50210.0000; BIGipServerpassport=887619850.50215.0000; uKey=97a13289be6445586b819425f91b9bcbcc15446c5f37bceb8352cc085d1017a4; current_captcha_type=Z'}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    return resp


def parse_json(resp, city):
    json_ticket = resp.json()
    data_lst = json_ticket['data']['result']
    # Each record is a '|' separated string; the useful fields sit at fixed indexes
    lst = []
    lst.append(['车次', '起始站', '到达站', '一等座', '二等座', '软卧', '硬卧', '硬座', '出行日期', '出发时间', '到达时间'])
    for item in data_lst:
        d = item.split('|')
        # d[3]  train number
        # d[6]  departure station code
        # d[7]  arrival station code
        # d[31] first-class seat
        # d[30] second-class seat
        # d[23] soft sleeper
        # d[28] hard sleeper
        # d[29] hard seat
        # d[13] travel date
        # d[8]  departure time
        # d[9]  arrival time
        lst.append([d[3], city[d[6]], city[d[7]], d[31], d[30], d[23], d[28], d[29], d[13], d[8], d[9]])
    return lst


def start(begin, end, date):
    lst = parse_json(send_request(begin, end, date), getcity())
    wk = openpyxl.Workbook()
    sheet = wk.active
    for item in lst:
        sheet.append(item)
    wk.save('车票查询.xlsx')


def getcity():
    # station_name.js pairs each Chinese station name with its code; build code -> name
    url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9151"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', resp.text)
    stations_data = dict(stations)
    stations_d = {}
    for item in stations_data:
        stations_d[stations_data[item]] = item
    return stations_d


def getHcity():
    # Same source, but name -> code, used when building the query URL
    url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9151"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', resp.text)
    stations_data = dict(stations)
    return stations_data


if __name__ == '__main__':
    begin = input('begin:')
    end = input('end:')
    date = input('date (yyyy-mm-dd):')
    start(begin, end, date)
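The travel date has to be in the yyyy-mm-dd form that the query URL (and the _jc_save_fromDate cookie above) uses; a small optional check before calling start() could look like this:

from datetime import datetime

# Sketch: reject dates that are not in the yyyy-mm-dd format the 12306 query expects.
def check_date(text):
    try:
        datetime.strptime(text, '%Y-%m-%d')
        return True
    except ValueError:
        return False

# e.g. check_date('2020-07-25') -> True, check_date('2020/7/25') -> False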
六、Scraping a novel from Biquge
With some free time on my hands, I thought back to a fantasy novel I read back in the day and figured I might as well go grab it!!!
import os

import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
url = 'https://www.xsbiquge.com/12_12735/'
ht = 'https://www.xsbiquge.com'

r = requests.get(url, headers=headers)
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')

if not os.path.exists('校园纯情霸主小说'):
    os.mkdir('校园纯情霸主小说')

# The chapter list lives in <div id="list">
content = soup.find('div', id='list')
Title = str(content.dt.text[:-2]).strip()
print(Title)

lst_dd = content.find_all('dd')
for item in lst_dd:
    # Chapter links are relative, so join them with the site root
    link = ht + item.a['href']
    name = item.a.text
    r = requests.get(link, headers=headers)
    chapter = r.content.decode('utf-8')
    soup = BeautifulSoup(chapter, 'html.parser')
    # Chapter text sits in <div id="content">; turn the non-breaking-space indents into line breaks
    text = soup.find('div', id='content').text.replace('\xa0\xa0\xa0\xa0', '\n')
    with open('校园纯情霸主小说' + '/' + name + '.txt', 'a', encoding='utf-8') as f:
        f.write(text)
    print("已完成" + name + " 的爬取!!!")
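Fetching every chapter back to back can get the crawler blocked; the sketch below adds a pause and a few retries per chapter (fetch_chapter is an illustrative helper, and the pause/retry numbers are arbitrary):

import time
import requests

# Sketch: fetch one chapter with simple retries and a polite pause between requests.
# fetch_chapter is an illustrative helper; retry count and pause length are assumptions.
def fetch_chapter(link, headers, retries=3, pause=1.0):
    for attempt in range(retries):
        try:
            r = requests.get(link, headers=headers, timeout=10)
            r.raise_for_status()
            time.sleep(pause)
            return r.content.decode('utf-8')
        except requests.RequestException:
            time.sleep(pause * (attempt + 1))
    return None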
七、Extracting toxin characteristics and symptoms
This is a project I am working on with my advisor: the text describing each toxin's characteristics has to be matched against a set of symptoms, and a table is generated in which a cell is 1 if the toxin shows that symptom and 0 otherwise.
import csv
import openpyxl
import xlrd

# Second-level symptom keyword workbooks, one per body system / examination type
lst_xlsx = ['二级其他检查.xlsx', '二级库呼吸.xlsx', '二级库尿液检查.xlsx', '二级库循环.xlsx', '二级库泌尿生殖.xlsx',
            '二级库消化.xlsx', '二级库甲状腺.xlsx', '二级库皮肤.xlsx', '二级库眼部.xlsx', '二级库神经.xlsx',
            '二级库粪便检查.xlsx', '二级库肌电图.xlsx', '二级库肝功能肾功能.xlsx', '二级库胃部检查.xlsx',
            '二级库脑电图.xlsx', '二级库血液检查.xlsx', '二级心电图.xlsx', '其它.xlsx']

name_lst = []      # toxin names
content_lst = []   # concatenated description text for each toxin
ans_lst = []       # one 0/1 row per toxin


def get_dw():
    # Read the toxin details sheet: column 2 is the name, columns 13/14/17/18 hold the description text
    wk = xlrd.open_workbook('duwuxiangqing.xls')
    table = wk.sheet_by_name('Sheet1')
    rows = table.nrows
    for i in range(2, rows):
        name = table.cell(i, 2).value
        text = table.cell(i, 13).value + table.cell(i, 14).value + table.cell(i, 17).value + table.cell(i, 18).value
        content_lst.append(text)
        name_lst.append(name)


def get_title():
    # Collect every symptom keyword, column by column, to use as the CSV header
    lst_title = ["Name"]
    for item in lst_xlsx:
        wk = openpyxl.load_workbook(item)
        sheet = wk.active
        rows = sheet.max_row
        cols = sheet.max_column
        for i in range(1, cols + 1):
            for j in range(1, rows + 1):
                size = sheet.cell(j, i).value
                if size is not None:
                    lst_title.append(size)
    return lst_title


def head():
    # Start every row with the toxin name
    for item in name_lst:
        ans_lst.append([item])
    # For every toxin description, check every keyword column of every workbook
    p = 0
    for cont_item in content_lst:
        cont_item = str(cont_item)
        print("目前正在进行第:" + str(p))
        for item in lst_xlsx:
            wk = openpyxl.load_workbook(item)
            sheet = wk.active
            rows = sheet.max_row
            cols = sheet.max_column
            for i in range(1, cols + 1):
                flag = 0
                num = 0
                for j in range(1, rows + 1):
                    size = sheet.cell(j, i).value
                    if size is not None:
                        num += 1
                        # find() returns -1 when the keyword is absent; 0 is a hit at the very start
                        if cont_item.find(str(size)) != -1:
                            flag = 1
                # If any keyword in this column matched, mark all of its cells 1, otherwise 0
                ans_lst[p].extend(["1" if flag == 1 else "0"] * num)
        p = p + 1
    lst_title = get_title()
    write_title(lst_title, ans_lst)


def write_title(title, ans):
    # Write the header row followed by one row per toxin
    f = open('data.csv', 'w', encoding='utf-8', newline='')
    csv_writer = csv.writer(f)
    csv_writer.writerow(title)
    for item in ans:
        csv_writer.writerow(item)
    f.close()


if __name__ == '__main__':
    get_dw()
    head()
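head() reopens every keyword workbook once per toxin, which gets slow as the toxin list grows. Below is a hedged sketch of the same matching logic that loads the keyword columns only once; load_keyword_columns and match_rows are made-up helper names, not part of the original script:

import openpyxl

# Sketch: load every keyword column once, then reuse it for all toxins.
# load_keyword_columns / match_rows are hypothetical helpers, not in the original script.
def load_keyword_columns(xlsx_files):
    columns = []  # each inner list holds the non-empty keywords of one column
    for fname in xlsx_files:
        sheet = openpyxl.load_workbook(fname).active
        for i in range(1, sheet.max_column + 1):
            col = [str(sheet.cell(j, i).value)
                   for j in range(1, sheet.max_row + 1)
                   if sheet.cell(j, i).value is not None]
            if col:
                columns.append(col)
    return columns


def match_rows(columns, names, contents):
    rows = []
    for name, text in zip(names, contents):
        row = [name]
        for col in columns:
            hit = any(keyword in str(text) for keyword in col)
            row.extend(["1" if hit else "0"] * len(col))
        rows.append(row)
    return rows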
A partial screenshot of data.csv: