Python 爬虫之 BeautifulSoup

复制代码
# -*- coding: UTF-8 -*-
import re
from bs4 import BeautifulSoup
import requests
import codecs
import sys  
# Python 2 only: make implicit str <-> unicode conversions use UTF-8 instead
# of ASCII. reload(sys) is required because the interpreter removes
# sys.setdefaultencoding at startup; reloading the module brings it back.
reload(sys)  
sys.setdefaultencoding('utf8') 

def mei_url(timeout=10):
    """Fetch the product listing page and return it as parsed HTML.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree (built with the ``lxml`` parser) of
        ``http://mdl.com/product``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get('http://mdl.com/product', timeout=timeout)
    # Decode the body as UTF-8 regardless of the charset requests detected.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')
    
def mei_info(sub_url='/product/item/293410', timeout=10):
    """Scrape one product page and append its details to the output file.

    The page at ``'http://mdl.com' + sub_url`` is downloaded and four pieces
    of text are extracted: the page title, the first intro body, the first
    ``<span>`` inside an intro body, and the third intro body (named
    ``introduce``, ``effect`` and ``crowd`` in the code; presumably the
    product introduction, its effect and the target audience -- confirm
    against the live page). The title is printed, then the four fields are
    joined with ``'&'`` and appended to ``E:\\note\\mei_infov3.txt`` followed
    by a newline and a ``'$'`` marker.

    Args:
        sub_url: Site-relative path of the product page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        IndexError: If the page does not contain the expected elements.
    """
    response = requests.get('http://mdl.com' + sub_url, timeout=timeout)
    # Decode the body as UTF-8 regardless of the charset requests detected.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # All selectors share the same container prefix; build them from it so
    # the long paths are written only once.
    main_sel = '#main > div.boundary > div > div.container__main'
    intro_sel = (main_sel + ' > div.section.section-intro.clearfix > div'
                 ' > div.section-intro__item__body.rich-text')

    title = soup.select(
        main_sel + ' > div.section.section-info.clearfix > h2')[0].get_text()
    intro_bodies = soup.select(intro_sel)
    introduce = intro_bodies[0].get_text()
    effect = soup.select(intro_sel + ' > span')[0].get_text()
    crowd = intro_bodies[2].get_text()

    print(title)

    # get_text() already returns unicode, so join the fields as unicode and
    # let the codecs writer do the single UTF-8 encode. Calling str() on each
    # field would, on Python 2, rely on the process-wide default encoding.
    record = u'&'.join([title, introduce, effect, crowd])
    with codecs.open(r'E:\note\mei_infov3.txt', 'a+', 'utf8') as out_file:
        out_file.write(record + u'\n$')
if __name__=='__main__':
    # The listing pages are read from locally saved HTML copies rather than
    # fetched live (mei_url() downloads the first listing page instead).
    listing_pages = [r'E:\note\mei.htm', r'E:\note\mei2.htm']
    # Product links look like /product/item/<six digits>.
    item_pattern = re.compile(r'/product/item/\d{6}')

    sub_urls = []
    for page_path in listing_pages:
        # Use a context manager so the file handle is closed once parsed.
        with open(page_path) as page_file:
            page_html = str(BeautifulSoup(page_file, 'lxml'))
        sub_urls.extend(item_pattern.findall(page_html))

    print(len(sub_urls))
    # Scrape every matched product link, in the order found (repeated links
    # are scraped again each time they occur).
    for sub_url in sub_urls:
        mei_info(sub_url)


    
复制代码

 

posted @   Mars.wang  阅读(440)  评论(0)  编辑  收藏  举报
编辑推荐:
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
阅读排行:
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端
点击右上角即可分享
微信分享提示