Python Web Scraping

No1:

# -*- coding:utf-8 -*-

import urllib.request

ua_headers = {"User-Agent": "..."}
request = urllib.request.Request("http://www.baidu.com", headers=ua_headers)
response = urllib.request.urlopen(request)
html = response.read()
print(html)

No2:

# -*- coding:utf-8 -*-
import urllib.request
from urllib import parse


def loadPage(url, filename):
    print("正在下载" + filename)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
    request = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(request).read()


def writePage(html, filename):
    print("正在保存" + filename)
    with open(filename, "wb+") as f:
        f.write(html)
    print("-" * 30)


def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)

    print("谢谢使用")


if __name__ == '__main__':
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)

No3:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib.request
from urllib import parse

# URL captured with a packet-sniffing tool; it is not the URL shown in the browser address bar
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"

# Full set of request headers
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
}

# Read the text to translate from the user
key = input("请输入需要翻译的文字:")

# Form data sent to the web server
formdata = {
    "type": "AUTO",
    "i": key,
    "doctype": "json",
    "xmlVersion": "1.8",
    "keyfrom": "fanyi.web",
    "ue": "UTF-8",
    "action": "FY_BY_CLICKBUTTON",
    "typoResult": "true"
}

# urlencode the form data and encode it to bytes
data = parse.urlencode(formdata).encode(encoding='UTF8')

# If the data argument of Request() is given, the request is sent as a POST;
# if it is omitted, the request is sent as a GET
request = urllib.request.Request(url, data=data, headers=headers)

print(str(urllib.request.urlopen(request).read(), 'utf-8'))

No4:

ajax
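The URL below is the background request the Douban ranking page fires via AJAX; requesting it directly returns the raw JSON data rather than a rendered HTML page, so only the start and limit form fields are needed.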

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib.request
from urllib import parse

url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}

formdata = {
    "start": "0",
    "limit": "20"
}

data = parse.urlencode(formdata).encode(encoding='utf-8')

request = urllib.request.Request(url, data=data, headers=headers)

print(str(urllib.request.urlopen(request).read(), 'utf-8'))

No5:

handler
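HTTPHandler(debuglevel=1) makes urllib print the raw HTTP request and response headers to stdout, which is handy for seeing exactly what a request sends and receives.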

import urllib.request

http_handler = urllib.request.HTTPHandler(debuglevel=1)
opener = urllib.request.build_opener(http_handler)
request = urllib.request.Request("http://www.baidu.com/")
response = opener.open(request)
print(str(response.read(), 'utf-8'))

No6:

proxy
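ProxyHandler takes a dict mapping scheme to proxy address (an empty dict disables the proxy), and install_opener() makes the chosen opener the default one used by urlopen().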

import urllib.request

proxyswitch = True
httpproxy_handler = urllib.request.ProxyHandler({"http": "222.22.66.211"})
nullproxy_handler = urllib.request.ProxyHandler({})

if proxyswitch:
    opener = urllib.request.build_opener(httpproxy_handler)
else:
    opener = urllib.request.build_opener(nullproxy_handler)

urllib.request.install_opener(opener)
request = urllib.request.Request("http://www.baidu.com/")
response = urllib.request.urlopen(request)
print(str(response.read(), 'utf-8'))

No7:

HTTP Basic Auth
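HTTPPasswordMgrWithDefaultRealm stores the username and password for a server, and HTTPBasicAuthHandler replays them automatically when the server answers with a 401 authentication challenge.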

import urllib.request

username = "test"
password = "123456"
webserver = "192.168.21.52"
passwordMgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
passwordMgr.add_password(None, webserver, username, password)
httpauth_handler = urllib.request.HTTPBasicAuthHandler(passwordMgr)
opener = urllib.request.build_opener(httpauth_handler)
request = urllib.request.Request("http://" + webserver)
response = opener.open(request)
print(str(response.read(), 'utf-8'))

No8:

cookie
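CookieJar keeps cookies in memory and HTTPCookieProcessor attaches them to every request made through this opener, so after the login POST below the same opener can fetch pages that require the session cookie.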

from urllib import request
from urllib import parse
from http import cookiejar

cookie = cookiejar.CookieJar()
cookie_handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(cookie_handler)
opener.addheaders = [("User-Agent", "xxx")]
url = "http://www.renren.com/PLogin.do"
data = {"email": "xxx@163.com", "password": "xxx"}
data = parse.urlencode(data).encode(encoding='UTF-8')
req = request.Request(url, data=data)
response = opener.open(req)
print(str(response.read(), 'utf-8'))

No9:

Scraping joke posts (duanzi)

from urllib import request
import re


class Spider:
    def __init__(self):
        self.page = 1
        self.switch = True

    def loadPage(self):
        print("正在下载数据...")
        url = "http://xiaohua.zol.com.cn/new/" + str(self.page) + ".html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        html = str(response.read(), "gbk")
        pattern = re.compile(r'<div\sclass="summary-text">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        for item in content_list:
            item = (item.replace("<p>", "").replace("</p>", "").replace("<br>", "")
                    .replace('<p class="bbsp">', "").replace("&nbsp;", ""))
            self.writePage(item)

    def writePage(self, item):
        print("正在写入数据...")
        with open("duanzi.txt", "a") as f:
            f.write(item)

    def startWork(self):
        while self.switch:
            self.loadPage()
            command = input("如果继续爬取,请按回车(退出输入quit)")
            if command == "quit":
                self.switch = False
            self.page += 1
        print("谢谢使用!")


if __name__ == "__main__":
    duanzi = Spider()
    duanzi.startWork()

No10:

Scraping images from Baidu Tieba threads

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from urllib import request
from urllib import parse
from lxml import etree


def loadPage(url):
    print("正在下载...")
    req = request.Request(url)
    html = request.urlopen(req).read()
    # Parse the HTML document into an element tree
    content = etree.HTML(html)
    # Return the list of links to every thread on the page
    link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')

    # link_list = content.xpath('//a[@class="j_th_tit"]/@href')
    for link in link_list:
        # Build the full URL of each thread
        fulllink = "http://tieba.baidu.com" + link
        print("link=" + link)
        loadImage(fulllink)


# Extract every image link inside a thread
def loadImage(link):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    req = request.Request(link, headers=headers)
    html = request.urlopen(req).read()
    # Parse the thread page
    content = etree.HTML(html)
    # Collect the image links posted on each floor of the thread
    link_list = content.xpath('//img[@class="BDE_Image"]/@src')
    # link_list = content.xpath('//div[@class="post_bubble_middle"]')
    # Download each image link
    for link in link_list:
        print("imglink=" + link)
        writeImage(link)


def writeImage(link):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    # Request the image itself
    req = request.Request(link, headers=headers)
    # Raw image bytes
    image = request.urlopen(req).read()
    # Use the last 10 characters of the URL as the filename
    filename = link[-10:]
    # Write the image bytes to a local file
    with open(filename, "wb") as f:
        f.write(image)
        print("已经成功下载 " + filename)


def tiebaSpider(url, beginPage, endPage):
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        # filename = "第" + str(page) + "页.html"
        fullurl = url + "&pn=" + str(pn)
        # print fullurl
        loadPage(fullurl)
        # print html

    print("谢谢使用")


if __name__ == "__main__":
    kw = input("请输入需要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.baidu.com/f?"
    key = parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)

No11:

Scraping Baidu image search results

# -*- coding: utf-8 -*-
"""Download Baidu image-search results for a given keyword"""
import re
import urllib.parse

import requests


def get_onepage_urls(onepageurl):
    """Return all image URLs on one result page, plus the URL of the next result page"""
    if not onepageurl:
        print('已到最后一页, 结束')
        return [], ''
    try:
        html = requests.get(onepageurl)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        pic_urls = []
        fanye_url = ''
        return pic_urls, fanye_url
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
    fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
    return pic_urls, fanye_url


def down_pic(pic_urls):
    """Download every image in the given list of URLs"""
    for i, pic_url in enumerate(pic_urls):
        try:
            pic = requests.get(pic_url, timeout=15)
            string = str(i + 1) + '.jpg'
            with open(string, 'wb') as f:
                f.write(pic.content)
                print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
        except Exception as e:
            print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
            print(e)
            continue


if __name__ == '__main__':
    keyword = '中国美女'  # Search keyword; change it to whatever you would search for in Baidu Images
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, fanye_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)

    fanye_count = 0  # number of result pages fetched so far
    while True:
        onepage_urls, fanye_url = get_onepage_urls(fanye_url)
        fanye_count += 1
        # print('第%s页' % fanye_count)
        if fanye_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)

    down_pic(list(set(all_pic_urls)))

No12:

Zhihu login

from bs4 import BeautifulSoup
import requests
import time


def captcha(captcha_data):
    with open("captcha.jpg", "wb") as f:
        f.write(captcha_data)
    text = input("请输入验证码:")
    return text


def zhihuLogin():
    sess = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    html = sess.get("https://www.zhihu.com/#signin", headers=headers).text
    bs = BeautifulSoup(html, "lxml")
    _xsrf = bs.find("input", attrs={"name": "_xsrf"}).get("value")

    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    captcha_data = sess.get(captcha_url, headers=headers).content
    text = captcha(captcha_data)
    data = {
        "_xsrf": _xsrf,
        "email": "",
        "password": "",
        "captcha": text
    }
    response = sess.post("https://www.zhihu.com/login/email", data=data, headers=headers)
    print(response.text)


if __name__ == "__main__":
    zhihuLogin()

No13:

JSON parsing (jsonpath)
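jsonpath pulls fields out of nested JSON with a path expression; "$..name" below matches every "name" key at any depth, which here yields the list of city names.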

import urllib.request
import json
import jsonpath

url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request)
html = response.read()
jsonobj = json.loads(html)
city_list = jsonpath.jsonpath(jsonobj, "$..name")
for item in city_list:
    print(item)

array = json.dumps(city_list, ensure_ascii=False)

with open("lagoucity.json", "wb+") as f:
    f.write(array.encode("utf-8"))

No14:

XPath parsing (lxml)

# -*- coding:utf-8 -*-

import urllib.request
from lxml import etree
import json

url = "http://www.qiushibaike.com/8hr/page/1/"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
request = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(request).read()
text = etree.HTML(html)

node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')
items = {}
for node in node_list:
    username = node.xpath('./div/a/@title')[0]
    image = node.xpath('.//div[@class="thumb"]//@src')
    content = node.xpath('.//div[@class="content"]/span')[0].text
    zan = node.xpath('.//i')[0].text
    comments = node.xpath('.//i')[1].text

    items = {
        "username": username,
        "image": image,
        "content": content,
        "zan": zan,
        "comments": comments
    }

    with open("qiushi.json", "ab+") as f:
        f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + b"\n")

No15:

Multithreading
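This is a producer/consumer setup: the crawl threads take page numbers from pageQueue and push the downloaded HTML into dataQueue, the parse threads drain dataQueue and write the extracted items to disk, and the CRAWL_EXIT / PARSE_EXIT flags tell each group of threads when to stop.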

# -*- coding:utf-8 -*-

import threading
from queue import Queue
from lxml import etree
import requests
import json


class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print("启动" + self.threadName)
        while not CRAWL_EXIT:
            try:
                page = self.pageQueue.get(False)
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                content = requests.get(url, headers=self.headers).text
                self.dataQueue.put(content)
            except:
                pass
        print("结束" + self.threadName)


class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.filename = filename

    def run(self):
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except:
                pass

    def parse(self, html):
        html = etree.HTML(html)

        node_list = html.xpath('//div[contains(@id, "qiushi_tag")]')
        items = {}
        for node in node_list:
            username = node.xpath('./div/a/@title')[0]
            image = node.xpath('.//div[@class="thumb"]//@src')
            content = node.xpath('.//div[@class="content"]/span')[0].text
            zan = node.xpath('.//i')[0].text
            comments = node.xpath('.//i')[1].text

            items = {
                "username": username,
                "image": image,
                "content": content,
                "zan": zan,
                "comments": comments
            }

            with open("qiushi.json", "ab+") as f:
                f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + b"\n")

CRAWL_EXIT = False
PARSE_EXIT = False


def main():
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    dataQueue = Queue()

    filename = open("duanzi.json", "a")

    crawList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    threadcrawl = []
    for threadName in crawList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename)
        thread.start()
        threadparse.append(thread)

    while not pageQueue.empty():
        pass

    global CRAWL_EXIT
    CRAWL_EXIT = True

    for thread in threadcrawl:
        thread.join()

    # Wait for the parse threads to drain the data queue, then signal them to stop
    while not dataQueue.empty():
        pass

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in threadparse:
        thread.join()


if __name__ == "__main__":
    main()

No16:

selenium / webdriver
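PhantomJS is a headless browser; driving it through selenium's webdriver API loads pages the way a real browser does, JavaScript included, and lets the script type into form fields and click buttons. Two snippets follow: the first searches Baidu and saves a screenshot, the second logs in to Douban.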

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("http://www.baidu.com/")

driver.find_element_by_id("kw").send_keys(u"中国美女")
# driver.find_element_by_id("su").click()
driver.find_element_by_id("su").send_keys(Keys.ENTER)
driver.save_screenshot("girl.png")
driver.get_cookies()

print(driver.page_source)

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')

driver.get("https://www.douban.com/")
driver.find_element_by_name("form_email").send_keys("xxx@example.com")
driver.find_element_by_name("form_password").send_keys("xxx")
driver.find_element_by_id("captcha_field").send_keys("short")
driver.find_element_by_class_name("bn-submit").click()
driver.save_screenshot("douban.png")

No17:

Testing with unittest
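setUp and tearDown create and quit the PhantomJS driver around the test; testDouyu keeps paging through the Douyu live-room list, printing each room's name and viewer count, until the "next page" button is disabled.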

from selenium import webdriver
import unittest
from bs4 import BeautifulSoup as bs


class douyu(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')

    def testDouyu(self):
        self.driver.get("https://www.douyu.com/directory/all")
        while True:
            soup = bs(self.driver.page_source, "lxml")
            names = soup.find_all("h3", {"class": "ellipsis"})
            numbers = soup.find_all("span", {"class": "dy-num fr"})
            for name, number in zip(names, numbers):
                print(u"观众人数:" + number.get_text().strip() + u"\t房间名:" + name.get_text().strip())

            if self.driver.page_source.find("shark-pager-disable-next") != -1:
                break
            self.driver.find_element_by_class_name("shark-pager-next").click()

    def tearDown(self):
        self.driver.quit()


if __name__ == "__main__":
    unittest.main()

No18:

Executing JavaScript

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from selenium import webdriver
import time

driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
driver.get("https://movie.douban.com/typerank?type_name=剧情&type=11&interval_id=100:90&action=")

time.sleep(30)
# Scroll the page body down 10000 pixels
js = "document.body.scrollTop=10000"
# js = "var q=document.documentElement.scrollTop=10000"

# Take a screenshot before scrolling
driver.save_screenshot("douban.png")

# Execute the JS statement
driver.execute_script(js)
time.sleep(20)

# Take another screenshot after scrolling
driver.save_screenshot("newdouban.png")

driver.quit()

No19:

tesseract: recognizing text in images (e.g. captchas)
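The notes end here without sample code for this topic. Below is a minimal, hypothetical sketch of driving tesseract from Python to read a captcha image; it assumes the pytesseract and Pillow packages plus the tesseract binary (with the chi_sim language data for Chinese) are installed, and it reuses the captcha.jpg filename from No12 purely as an example.

# -*- coding:utf-8 -*-

import pytesseract
from PIL import Image

# Load the captcha image saved earlier (the filename is just an example)
image = Image.open("captcha.jpg")

# Convert to greyscale and binarize so the characters stand out from the background
image = image.convert("L")
image = image.point(lambda x: 0 if x < 140 else 255)

# Run tesseract; lang="chi_sim" needs the simplified-Chinese language pack,
# use lang="eng" for captchas made of letters and digits
text = pytesseract.image_to_string(image, lang="chi_sim")
print(text.strip())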

posted @ 2018-12-05 19:32  嘉禾世兴