爬虫学习项目
项目1:
基于搜狗微信公众号的关键字搜索
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
"""Search Sogou Weixin for official accounts matching a user-supplied keyword.

For every result page, prints each account's title, WeChat id, and the
key/value detail rows (`<dl>` entries), then follows the "next page" link
until no more pages exist.
"""
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# ChromeDriver binary shipped alongside this script.
DRIVER_PATH = os.path.join(os.path.dirname(__file__), 'chromedriver.exe')

opt = webdriver.ChromeOptions()
# Headless Chrome is easier for sites to flag as automation, so a regular
# desktop browser User-Agent is added as a disguise.
opt.headless = True
opt.add_argument('User-Agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
                 ' Chrome/63.0.3239.132 Safari/537.36')

driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=opt)
try:
    driver.get('https://weixin.sogou.com/')
    keyword = input('请输入公众号关键字:')
    # find_element(By.ID, ...) works on both Selenium 3 and 4; the
    # find_element_by_* helpers are removed in Selenium 4.
    driver.find_element(By.ID, 'query').send_keys(keyword)
    driver.find_element(By.CLASS_NAME, 'swz2').click()

    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        result_list = soup.find('ul', {'class': 'news-list2'})
        if result_list is None:
            # No result container on the page: the keyword matched nothing.
            print('查询不到有关于此关键字的内容')
            break

        for item in result_list.find_all('li'):
            title = item.find('p', {'class': 'tit'}).find('a').text
            wxid = item.find('p', {'class': 'info'}).find('label').text
            details = {}
            for entry in item.find_all('dl'):
                key_tag = entry.find('dt')
                # Strip embedded <script> tags so only the visible label text
                # remains (side effects belong in a plain loop, not a
                # comprehension).
                for script in key_tag('script'):
                    script.extract()
                key = key_tag.text.replace('\n', '')
                value_tag = entry.find('a') or entry.find('dd')
                details[key] = value_tag.text.replace('\n', '')
            print('-' * 20)
            print('公众号名称:{}'.format(title))
            print('微信号:{}'.format(wxid))
            for k, v in details.items():
                print('{} {}'.format(k, v))

        try:
            # Advance to the next result page; a timeout or missing element
            # means the last page has been reached.
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.ID, 'sogou_next')))
            driver.find_element(By.ID, 'sogou_next').click()
        except (TimeoutException, NoSuchElementException):
            break
finally:
    # quit() (not close()) shuts down the ChromeDriver process as well as the
    # browser window, and runs even if an exception escaped the loop above.
    driver.quit()
将 Chrome 设置为无头浏览器后,可能会出现访问异常——网页会检测到这是自动化测试软件,因此给浏览器添加了一个 User-Agent 请求头进行伪装。
项目2:
西刺代理
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
"""Scrape free proxies from xicidaili.com, then verify which ones work.

getProxyList() appends scraped proxies to proxy.txt (one `|`-separated record
per line); verifyProxyList() workers read proxy.txt line by line and write
every proxy that can fetch baidu.com to verified.txt.
"""
from bs4 import BeautifulSoup
import requests
import http.client
import threading

# Shared state for the verifier worker threads.
inFile = open('proxy.txt')
outFile = open('verified.txt', 'w')
lock = threading.Lock()  # guards both inFile reads and outFile writes

# One shared header dict instead of a duplicate copy inside each function.
REQUEST_HEADER = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}


def getProxyList(targeturl="http://www.xicidaili.com/nn/"):
    """Scrape result pages 1-9 of *targeturl* and append every proxy found to
    proxy.txt.

    Returns the number of proxy records written.
    """
    countNum = 0
    # `with` guarantees the file is closed even if a request/parse fails;
    # the original left it open on any exception.
    with open('proxy.txt', 'a') as proxyFile:
        for page in range(1, 10):
            req = requests.get(targeturl + str(page), headers=REQUEST_HEADER)
            soup = BeautifulSoup(req.text, "html.parser")
            trs = soup.find('table', id='ip_list').find_all('tr')
            for tr in trs[1:]:  # skip the header row
                tds = tr.find_all('td')
                if tds[0].find('img') is None:
                    # Rows without a flag image carry no country/location.
                    nation = '未知'
                    locate = '未知'
                else:
                    nation = tds[0].find('img')['alt'].strip()
                    locate = tds[3].text.strip()
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                anony = tds[4].text.strip()
                protocol = tds[5].text.strip()
                speed = tds[6].find('div')['title'].strip()
                time = tds[8].text.strip()
                proxyFile.write('%s|%s|%s|%s|%s|%s|%s|%s\n' % (
                    nation, ip, port, locate, anony, protocol, speed, time))
                print('%s=%s:%s' % (protocol, ip, port))
                countNum += 1
    return countNum


def verifyProxyList():
    """Worker: pull proxy records from the shared inFile until it is
    exhausted; for each one, attempt a GET of baidu.com through the proxy and
    record working proxies in outFile.
    """
    myurl = 'http://www.baidu.com/'
    while True:
        # `with lock` releases the lock on every path; the original's
        # acquire/release pairs leaked the lock if a call in between raised,
        # deadlocking all other worker threads.
        with lock:
            ll = inFile.readline().strip()
        if not ll:
            break
        fields = ll.split('|')
        ip = fields[1]
        port = fields[2]
        conn = None
        try:
            # HTTPConnection wants an int port; the original passed a str.
            conn = http.client.HTTPConnection(ip, int(port), timeout=5.0)
            conn.request(method='GET', url=myurl, headers=REQUEST_HEADER)
            conn.getresponse()
            with lock:
                print("+++Success:" + ip + ":" + port)
                outFile.write(ll + "\n")
        except (OSError, http.client.HTTPException, ValueError):
            # Narrowed from a bare `except:`; a dead proxy is expected and
            # best-effort, so just report and move on.
            print("---Failure:" + ip + ":" + port)
        finally:
            if conn is not None:
                conn.close()  # the original leaked every connection


if __name__ == '__main__':
    # Truncate proxy.txt so reruns do not accumulate stale entries.
    open('proxy.txt', 'w').close()

    # Scrape the four xicidaili categories.
    for url, label in (("http://www.xicidaili.com/nn/", u"国内高匿:"),
                       ("http://www.xicidaili.com/nt/", u"国内透明:"),
                       ("http://www.xicidaili.com/wn/", u"国外高匿:"),
                       ("http://www.xicidaili.com/wt/", u"国外透明:")):
        print(label + str(getProxyList(url)))

    print(u"\n验证代理的有效性:")
    all_thread = []
    for _ in range(30):
        t = threading.Thread(target=verifyProxyList)
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()

    inFile.close()
    outFile.close()
    print("All Done.")