爬取 https://www.parenting.com/baby-names/boys/earl 网站中 Top 10 男孩、女孩名字及其相关信息(含义 Meaning、来源 Origin、流行原因 Why it’s big)。
爬取源代码如下:
"""Scrape baby-name entries linked from a parenting.com page into a CSV.

The script downloads the index page, follows every link that points at the
boy/girl baby-name list pages, extracts each entry's name, meaning, origin
and "Why it's big" text, prints the four columns and writes them to
``wt10.csv`` (gb18030 encoded).
"""
import io
import re
import sys
from itertools import zip_longest

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

INDEX_URL = 'https://www.parenting.com/baby-names/boys/earl'

# A link is followed when its href contains one of these URL fragments.
NAME_LIST_URLS = (
    'https://www.parenting.com/pregnancy/baby-names/baby-boy-names/',
    'https://www.parenting.com/pregnancy/baby-names/girl-baby-names/',
)

REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the run
OUTPUT_CSV = 'wt10.csv'
OUTPUT_ENCODING = 'gb18030'
COLUMNS = ['EnName', 'Meaning', 'Origin', 'Description']

# Compiled once instead of on every page.  Raw strings keep ``\s`` a regex
# escape rather than an (invalid) string escape.
NAME_RE = re.compile(r'<h4>(.*?)</h4>')
MEANING_RE = re.compile(r'<p><strong>Meaning:</strong>\s(.*?)</p>', re.S)
ORIGIN_RE = re.compile(r'<p><strong>Origin:</strong>\s(.*?)</p>', re.S)
REASON_RE = re.compile(r'<p><strong>Why it’s big:</strong>\s(.*?)</p>', re.S)


def _fetch(url):
    """Return the body of *url* as text.

    Raises:
        requests.RequestException: on connection failure, timeout or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly: an error page would otherwise be parsed as "no entries".
    response.raise_for_status()
    return response.text


def _find_list_urls(index_html):
    """Return the unique name-list hrefs found in *index_html*.

    Duplicates are dropped while keeping first-seen order, so repeated runs
    visit the pages (and therefore write the CSV rows) in the same order.
    """
    soup = BeautifulSoup(index_html, "lxml")
    hrefs = (anchor.get("href") for anchor in soup.find_all('a', href=True))
    wanted = (
        href for href in hrefs
        if any(fragment in href for fragment in NAME_LIST_URLS)
    )
    return list(dict.fromkeys(wanted))


def _parse_list_page(page_html):
    """Return ``(name, meaning, origin, reason)`` tuples for one list page.

    Names come from every non-empty ``<h4>`` in the raw HTML; the other three
    fields come from the elements whose class is ``description``.  If the
    four fields are not found the same number of times, the shorter ones are
    padded with ``''`` so the columns keep the same length.
    """
    soup = BeautifulSoup(page_html, "lxml")
    descriptions = str(soup.find_all(attrs={'class': 'description'}))

    names = [name for name in NAME_RE.findall(page_html) if name]
    meanings = MEANING_RE.findall(descriptions)
    origins = ORIGIN_RE.findall(descriptions)
    reasons = REASON_RE.findall(descriptions)

    # Padding per page keeps a gap on one page from shifting the rows of
    # every later page, and avoids pandas' "arrays must be of the same
    # length" error when the frame is built.
    return list(zip_longest(names, meanings, origins, reasons, fillvalue=''))


def main():
    """Scrape all linked name lists, print the columns and write the CSV."""
    # Re-wrap stdout so the printed text is emitted as gb18030.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding=OUTPUT_ENCODING)

    rows = []
    for url in _find_list_urls(_fetch(INDEX_URL)):
        try:
            page_html = _fetch(url)
        except requests.RequestException as exc:
            # One unreachable list page should not discard the others.
            print(f'skipping {url}: {exc}', file=sys.stderr)
            continue
        rows.extend(_parse_list_page(page_html))

    frame = pd.DataFrame(rows, columns=COLUMNS)

    for column in COLUMNS:
        print(frame[column].tolist())

    frame.to_csv(OUTPUT_CSV, encoding=OUTPUT_ENCODING)


if __name__ == "__main__":
    main()
csv文件截图: