Web Crawler Learning Projects

Project 1:

Keyword search of WeChat official accounts via Sogou's Weixin search (weixin.sogou.com)

from selenium import webdriver
import os
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

DRIVER_PATH = os.path.join(os.path.dirname(__file__), 'chromedriver.exe')
opt = webdriver.ChromeOptions()
opt.headless = True  # run Chrome headless (no visible window)
# Chrome's switch is lower-case `user-agent`; disguise the headless browser as a normal desktop Chrome
opt.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                 ' Chrome/63.0.3239.132 Safari/537.36')
webphjs = webdriver.Chrome(executable_path=DRIVER_PATH, options=opt)
webphjs.get('https://weixin.sogou.com/')
_input = webphjs.find_element_by_id('query')          # keyword input box
search = input('请输入公众号关键字:')
_input.send_keys(search)
webphjs.find_element_by_class_name('swz2').click()    # "搜公众号" search button
while True:  # page through the result list until there is no "next page" link
    soup = BeautifulSoup(webphjs.page_source, 'lxml')
    try:
        new_list = soup.find('ul', {'class': 'news-list2'}).find_all('li')
        for i in new_list:
            title = i.find('p', {'class': 'tit'}).find('a').text
            wxid = i.find('p', {'class': 'info'}).find('label').text
            # extra detail rows are <dl> pairs: <dt> holds the label, <a>/<dd> holds the value
            jt = {}
            for item in i.find_all('dl'):
                key = item.find('dt')
                [s.extract() for s in key('script')]  # strip embedded <script> tags from the label
                key = key.text.replace('\n', '')
                value = item.find('a') or item.find('dd')
                value = value.text.replace('\n', '')
                jt[key] = value
            print('-'*20)
            print('公众号名称:{}'.format(title))
            print('微信号:{}'.format(wxid))
            for k, v in jt.items():
                print('{} {}'.format(k, v))
        # wait up to 3 seconds for the "next page" link; stop paging when it is absent
        locator = (By.ID, 'sogou_next')
        try:
            next_page = WebDriverWait(webphjs, 3).until(EC.presence_of_element_located(locator))
            next_page.click()
        except Exception:
            break
    except AttributeError:
        # the result list was not found on the page – no matches for this keyword
        print('查询不到有关于此关键字的内容')
        break
webphjs.quit()  # quit() also shuts down the chromedriver process

Chrome is run as a headless browser, which can cause access problems because the page may detect that it is driven by automated test software, so a User-Agent is added to the browser as a disguise.
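Spoofing the User-Agent is often not enough on its own: headless Chrome can also be detected through flags such as navigator.webdriver. The snippet below is a minimal, hypothetical sketch of extra ChromeOptions that are commonly combined with the User-Agent trick; the switches are standard Chrome/chromedriver options, but whether Sogou actually checks for them is an assumption, not something verified here.

from selenium import webdriver

opt = webdriver.ChromeOptions()
opt.headless = True
# hide the "Chrome is being controlled by automated test software" infobar
opt.add_experimental_option('excludeSwitches', ['enable-automation'])
opt.add_experimental_option('useAutomationExtension', False)
# stop navigator.webdriver from reporting true (effective on newer Chrome builds)
opt.add_argument('--disable-blink-features=AutomationControlled')
opt.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                 ' Chrome/63.0.3239.132 Safari/537.36')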

Project 2:

Xici proxies (xicidaili.com): scrape free proxy listings into proxy.txt, then verify them concurrently and write the working ones to verified.txt.

from bs4 import BeautifulSoup
import requests
import http.client
import threading

# make sure proxy.txt exists before opening it for reading (a first run would otherwise fail)
open('proxy.txt', 'a').close()
inFile = open('proxy.txt')
outFile = open('verified.txt', 'w')
lock = threading.Lock()


def getProxyList(targeturl="http://www.xicidaili.com/nn/"):
    countNum = 0
    proxyFile = open('proxy.txt', 'a')

    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}

    for page in range(1, 10):
        url = targeturl + str(page)
        req = requests.get(url, headers=requestHeader)
        soup = BeautifulSoup(req.text, "html.parser")
        trs = soup.find('table', id='ip_list').find_all('tr')
        for tr in trs[1:]:
            tds = tr.find_all('td')
            if tds[0].find('img') is None:
                nation = '未知'
                locate = '未知'
            else:
                nation = tds[0].find('img')['alt'].strip()
                locate = tds[3].text.strip()
            ip = tds[1].text.strip()
            port = tds[2].text.strip()
            anony = tds[4].text.strip()
            protocol = tds[5].text.strip()
            speed = tds[6].find('div')['title'].strip()
            time = tds[8].text.strip()

            proxyFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' % (nation, ip, port, locate, anony, protocol, speed, time))
            print('%s=%s:%s' % (protocol, ip, port))
            countNum += 1

    proxyFile.close()
    return countNum


def verifyProxyList():
    '''
    验证代理的有效性
    '''
    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    myurl = 'http://www.baidu.com/'

    while True:
        lock.acquire()
        ll = inFile.readline().strip()
        lock.release()
        if len(ll) == 0:
            break
        line = ll.split('|')
        ip = line[1]
        port = line[2]

        try:
            # connect to the proxy itself and request an absolute URL through it
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=myurl, headers=requestHeader)
            res = conn.getresponse()
            if res.status != 200:
                raise Exception('bad status %d' % res.status)
            lock.acquire()
            print("+++Success:" + ip + ":" + port)
            outFile.write(ll + "\n")
            lock.release()
        except Exception:
            print("---Failure:" + ip + ":" + port)


if __name__ == '__main__':
    open('proxy.txt', 'w').close()  # truncate any proxies left over from a previous run

    proxynum = getProxyList("http://www.xicidaili.com/nn/")
    print(u"国内高匿:" + str(proxynum))
    proxynum = getProxyList("http://www.xicidaili.com/nt/")
    print(u"国内透明:" + str(proxynum))
    proxynum = getProxyList("http://www.xicidaili.com/wn/")
    print(u"国外高匿:" + str(proxynum))
    proxynum = getProxyList("http://www.xicidaili.com/wt/")
    print(u"国外透明:" + str(proxynum))

    print(u"\n验证代理的有效性:")

    all_thread = []
    for i in range(30):
        t = threading.Thread(target=verifyProxyList)
        all_thread.append(t)
        t.start()

    for t in all_thread:
        t.join()

    inFile.close()
    outFile.close()
    print("All Done.")

 
