"""Scrape detail pages of English baby names.

CSV storage goes through html_save(s); image storage through pic_save(url, name).
The scraper first collects the links to every name-detail page from the index
page, then visits each link in turn and calls the storage functions to save
the extracted data.
"""
import sys
import io
import re
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
import requests
from bs4 import BeautifulSoup
from urllib import request


def html_save(s):
    """Append one line *s* to Name.csv (gb18030 so zh-CN Excel opens it)."""
    with open('Name.csv', 'a', encoding='gb18030') as f:
        f.write(s + '\n')


def pic_save(url, name):
    """Download the image at *url* and save it as <root><name>.jpg."""
    root = 'C://Users//L//Desktop//ba//'
    path = root + name + '.jpg'
    r = requests.get(url)
    # `with` closes the file on exit; the original's extra f.close() was redundant.
    with open(path, 'wb') as f:
        f.write(r.content)


def getName_link():
    """Scrape the boy-names index page and return the detail-page URLs."""
    lst = []
    url = 'http://www.babynology.com/baby-boy-names.html'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for div in soup.find_all('div', {'class': 'babynology_textevidence babynology_bg_grey babynology_shadow babynology_radius left overflow_scroll'}):
        for strong in div.find_all('strong'):
            a = strong.find_all('a')[0]
            lst.append(a.get('href').replace('\n', ''))
    return lst


def _clean(text):
    """Strip spaces and newlines from scraped text.

    NOTE(review): the original chained a second space-replace that may have
    targeted a non-breaking space (U+00A0) — confirm against the live pages.
    """
    return text.replace(' ', '').replace('\n', '')


def hh(lst):
    """Fetch every detail page in *lst*; print and store the name, gender,
    numerology text, and the name image."""
    for url in lst:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')

        name = soup.find('h2', {'class': 'txtclrm name-head2'}).text
        print("Name:", name)

        gender = soup.find('h5', {'style': 'color:#000;'}).text
        print("Gender:", gender)

        # The numerology <h5> mixes the wanted prose with a <span> header and
        # an inline <script>; delete those two chunks from the combined text.
        h5 = soup.find('h5', {'style': 'color:#000; text-align:justify;'})
        numerology = _clean(h5.text)
        span_txt = _clean(h5.find('span').text)
        script_txt = _clean(h5.find('script').text)
        # BUG FIX: the original used Numerology.strip(b).strip(a), but
        # str.strip() treats its argument as a *character set* and only trims
        # the string's ends — it cannot remove a substring. replace() deletes
        # the exact chunks as intended.
        n = numerology.replace(script_txt, '').replace(span_txt, '')
        n = name + ' Numerology:' + n
        # NOTE(review): the original chained a second comma-replace that may
        # have targeted a full-width comma ('，') — confirm against live pages.
        n = n.replace(',', ' ')
        print(n)

        # (Removed a second, unused requests.get(url) the original made here.)
        pic = soup.find('img', {'style': 'margin-left:-10px; margin-top:-5px;'}).get('src')

        html_save('Name:' + name)
        html_save('Gender:' + gender)
        html_save(n)
        pic_save(pic, name)
        print('---------------------------------------------------------------------------')


if __name__ == '__main__':
    # Guarded so importing this module no longer fires network I/O.
    hh(getName_link())