"Hand in Hand into the Python World" Series, Part 2

Web Scraping Series

  • What is a crawler? What kinds are there? What jobs can I look for after learning scraping, and how much do they pay?

    • A web crawler (also known as a web spider or web robot, and in the FOAF community more often as a web chaser) is a program or script that automatically fetches information from the World Wide Web according to a defined set of rules.
    • Crawlers generally fall into two categories: general-purpose crawlers and focused crawlers.
    • The relevant roles are usually crawler engineer or data analysis engineer.
    • Judging from the last few graduating classes at 老男孩python学院, crawler engineers earn roughly 15k RMB per month.
  • Which modules and frameworks are commonly used for scraping?

    • Commonly used modules include requests, bs4, lxml, re, selenium, and appium.
    • Common crawler frameworks include Scrapy and PySpider.
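As a quick taste of the lighter-weight end of that toolbox, here is a sketch using only the standard-library re module. The HTML fragment and the pattern are made up for the example; for real pages, bs4 or lxml are far more robust than regexes:

```python
import re

# A made-up HTML fragment standing in for a fetched page.
html = '''
<ul class="article">
    <li><a href="//example.com/news/1">First headline</a></li>
    <li><a href="//example.com/news/2">Second headline</a></li>
</ul>
'''

# Extract (href, link text) pairs. Fine for quick one-offs on
# well-behaved markup; fragile against real-world HTML.
links = re.findall(r'<a href="([^"]+)">([^<]+)</a>', html)
print(links)
# [('//example.com/news/1', 'First headline'), ('//example.com/news/2', 'Second headline')]
```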
  • Appetizer: implementing online translation in a few lines

    import requests
    import json
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    
    
    def main(keys=''):
        url = 'http://fy.iciba.com/ajax.php?a=fy'
        data = {
            'f': 'auto',
            't': 'auto',
            'w': keys
        }
        response = requests.post(url, headers=headers, data=data)
        result = json.loads(response.text)
        try:
            val = result['content']['word_mean']  # Chinese -> English
        except KeyError:
            val = result['content']['out']  # English -> Chinese
        return val
    
    if __name__ == '__main__':
        keys = input('Enter the Chinese or English text to translate: ')
        if not keys:
            print('Please enter valid Chinese or English text to translate.')
        else:
            data = main(keys)
            print(data)
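The try/except above hinges on the shape of iciba's JSON: one translation direction fills a `content.word_mean` field, the other a `content.out` field. The canned responses below are illustrative stand-ins, not captured from the live API, but they show the branching offline:

```python
import json

def pick_translation(raw):
    """Pick the right field from an iciba-style JSON response."""
    content = json.loads(raw)['content']
    try:
        return content['word_mean']  # Chinese -> English
    except KeyError:
        return content['out']        # English -> Chinese

# Hypothetical responses mimicking the two shapes.
zh_to_en = '{"status": 1, "content": {"word_mean": "hello; hi"}}'
en_to_zh = '{"status": 1, "content": {"out": "你好"}}'

print(pick_translation(zh_to_en))  # hello; hi
print(pick_translation(en_to_zh))  # 你好
```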
    
  • Scraping Douban Movies Top 250

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    
    # Fetch the page
    response = requests.get(url=url, headers=headers)
    data = response.text
    
    
    soup = BeautifulSoup(data, 'html.parser')

    ol = soup.find(name='ol', attrs={'class': 'grid_view'})
    li_list = ol.find_all(name='li')
    
    wb = Workbook()
    
    sheet = wb.active
    sheet['A1'].value = 'No.'
    sheet['B1'].value = 'Title'
    sheet['C1'].value = 'Rating'
    sheet['D1'].value = 'Quote'
    sheet['E1'].value = 'Poster'
    
    for index, li in enumerate(li_list, start=1):
        # attrs must be a dict ({'class': 'title'}), not a set
        name = li.find(name='span', attrs={'class': 'title'})
        rate = li.find(name='span', attrs={'class': 'rating_num'})
        inq = li.find(name='span', attrs={'class': 'inq'})
        img = li.find(name='img')
        row = str(index + 1)
        sheet['A' + row].value = index
        sheet['B' + row].value = name.text
        sheet['C' + row].value = rate.text
        sheet['D' + row].value = inq.text if inq else ''  # some entries have no quote
        sheet['E' + row].value = img['src']
    
    wb.save('douban.xlsx')
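openpyxl is a third-party dependency; if all you need is a file Excel can open, the standard-library csv module can write the same rows. A minimal sketch, with dummy rows standing in for the scraped values:

```python
import csv

# Dummy rows standing in for (index, title, rating, quote, poster URL).
rows = [
    (1, 'The Shawshank Redemption', '9.7', 'Hope is a good thing.', 'https://example.com/p1.jpg'),
    (2, 'Farewell My Concubine', '9.6', '', 'https://example.com/p2.jpg'),
]

# utf-8-sig writes a BOM so Excel detects UTF-8 and renders
# Chinese titles correctly; newline='' is required by the csv module.
with open('douban.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['No.', 'Title', 'Rating', 'Quote', 'Poster'])
    writer.writerows(rows)
```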
    
  • Scraping Autohome news

    import requests
    from bs4 import BeautifulSoup
    from openpyxl import Workbook
    
    
    def run(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.encoding = 'gbk'  # Autohome pages are GB-encoded
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find the news list <ul>
        ul = soup.find(name='ul', attrs={'class': 'article'})
        # Collect every <li> entry
        li_list = ul.find_all(name='li')
        infos = []
        for li in li_list:
            h3 = li.find(name='h3')
            a = li.find(name='a')
            p = li.find(name='p')
            infos.append({
                'title': h3.text if h3 else '',
                'href': 'http:' + a['href'] if a else '',
                'info': p.text if p else '',
            })
        print(infos)
    
    if __name__ == '__main__':
        url = 'https://www.autohome.com.cn/news/'
        run(url)
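The `response.encoding = 'gbk'` line matters: when a server omits the charset, requests falls back to a Latin-1 guess, and decoding GB-encoded bytes with the wrong codec produces mojibake. A small standard-library sketch of the difference (the sample string is our own, not fetched from the site):

```python
# '汽车之家' ('Autohome') encoded the way a GB-encoded page serves it.
raw = '汽车之家'.encode('gbk')

good = raw.decode('gbk')      # correct round-trip
bad = raw.decode('latin-1')   # mojibake: each byte becomes a stray character

print(good)  # 汽车之家
print(bad)
```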
    
posted @ 2019-06-19 00:13  巫小诗