爬取网页:批量下载微信公众号文章并保存为本地 HTML 文件

# coding=utf-8  
  
import lxml, bs4, re, requests  


csvContent = ''  # Unused by this script; kept so the module namespace is unchanged.

# For offline debugging, a locally saved copy of the index page can be parsed
# instead of downloading it (pass an open file object to bs4.BeautifulSoup).

# Index article whose links point at every article of the series.
INDEX_URL = "http://mp.weixin.qq.com/s/u_WmkE5meMWuZ81G5gHhBQ"
# Only links starting with this prefix are downloaded.
ARTICLE_URL_PREFIX = 'http://mp.weixin.qq.com'
# Directory the downloaded pages are written to (string prefix, ends with a separator).
OUTPUT_DIR = 'D:\\Java编程的逻辑\\'
# Series name that is removed from every article title.
SERIES_NAME = "计算机程序的思维逻辑"
# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30
# Characters that cannot appear in a Windows file name.
ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def clean_title(raw_title):
    """Turn a raw article title into a string usable as a file name.

    The series name, parentheses and colons are removed, surrounding
    whitespace is stripped and a single trailing "/" is dropped, exactly as
    the original script did.  Any character that is still illegal in a
    Windows file name is then replaced by "_" so that ``open`` cannot fail
    because of the title.

    Args:
        raw_title: Title text as extracted from the page.

    Returns:
        The cleaned title; may be empty if nothing usable remains.
    """
    title = raw_title.strip()
    title = title.replace(SERIES_NAME, "")
    for unwanted in (")", "(", ":"):
        title = title.replace(unwanted, "")
    title = title.strip()
    if title.endswith("/"):
        title = title[:-1]
    return ILLEGAL_FILENAME_CHARS.sub("_", title)


def main():
    """Download every article linked from the index page into OUTPUT_DIR.

    Each article is saved as ``<cleaned title>.html`` encoded as UTF-8.
    Links without an ``href``, pages without a title element and articles
    whose download fails are reported and skipped instead of aborting the
    whole run.
    """
    import os  # Local import: only needed to create the output directory.

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    index_page = requests.get(INDEX_URL, timeout=REQUEST_TIMEOUT)
    index_soup = bs4.BeautifulSoup(index_page.text, 'html.parser')

    for link in index_soup.find_all('a'):
        href = link.get('href')
        # <a> tags without an href yield None, which has no startswith().
        if not href or not href.startswith(ARTICLE_URL_PREFIX):
            continue

        try:
            article_page = requests.get(href, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            print('skipped (download failed): ' + href + ' ' + str(error))
            continue

        article_soup = bs4.BeautifulSoup(article_page.text, 'html.parser')
        headings = article_soup.find_all(attrs={'class': 'rich_media_title'})
        if not headings:
            print('skipped (no title element): ' + href)
            continue

        # get_text() also works when the title element contains nested tags,
        # where .string would be None.
        title = clean_title(headings[0].get_text())
        if not title:
            print('skipped (empty title): ' + href)
            continue

        print(title)
        file_name = OUTPUT_DIR + title + '.html'

        with open(file_name, 'w', encoding='utf-8') as out_file:
            out_file.write(article_page.text)


if __name__ == "__main__":
    main()
效果如下
posted on 2018-03-01 18:21 刘达人186 阅读(108) 评论(0) 收藏举报