python 爬虫

# -*- coding:utf-8 -*-
__version__ = '1.0.0.0'
"""
@brief : 简介
@details: 详细信息
@author : zhphuang
@date : 2019-02-22
"""
import os
import time
import random
import requests
import urllib.request
from selenium import webdriver
from bs4 import BeautifulSoup



class Spider(object):
    """Base crawler that owns a Selenium-driven Chrome browser.

    Subclasses override :meth:`get_info` to implement the actual crawl.
    """

    def __init__(self):
        """Start a Chrome session with the driver found by ``_getdriverpath``."""
        options = webdriver.ChromeOptions()

        # Optional Chrome tweaks, left disabled:
        # options.add_argument('--headless')  # run Chrome without a window
        # options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
        # options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2,
        #     'profile.default_content_setting_values': {'notifications': 2}})  # do not load images
        # Alternative driver:
        # self.browser = webdriver.PhantomJS(executable_path=self._getdriverpath())

        # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
        self.browser = webdriver.Chrome(
            executable_path=self._getdriverpath(),
            options=options,
        )
        # self.browser.implicitly_wait(60)

    def get_info(self):
        """Run the crawl. The base implementation does nothing."""
        pass

    def quit(self):
        """Shut down the browser session, if one was started.

        Safe to call more than once, and safe on instances whose
        ``__init__`` never created ``self.browser``.
        """
        browser = getattr(self, "browser", None)
        if browser is None:
            return
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process running.
        browser.quit()
        self.browser = None

    def _getdriverpath(self):
        """Return the path of the ``chromedriver`` file next to this module."""
        # Alternatives used previously:
        # path = os.path.join(os.path.split(__file__)[0], "phantomjs")
        # path = "C://chromedriver.exe"
        return os.path.join(os.path.dirname(__file__), "chromedriver")


class UserNameSpider(Spider):
    """Crawl forum 173 on bbs.jj.cn and download the avatar of every poster.

    Pages are fetched with ``requests`` and parsed with BeautifulSoup, so the
    Selenium browser created by ``Spider.__init__`` is deliberately not
    started.
    """

    # Pool of User-Agent strings; one is picked at random for each forum page.
    AGENTS = (
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    )

    # SECURITY NOTE(review): this looks like a captured, logged-in browser
    # session (it carries a user id, login info and auth tokens). Credentials
    # should not live in source control: prefer passing ``cookie`` to
    # ``__init__`` from configuration, and treat this session as exposed.
    DEFAULT_COOKIE = "UM_distinctid=1690eef0bfef2-0211bcc91dff28-36617102-1aeaa0-1690eef0bff6c7; Hm_lvt_65699696998080926ad677627de4c418=1550733742; qDH_visitedfid=173D254; qDH_oldtopics=D6937765D6937910D; Hm_lvt_c22ac8657ced9bd55e529ce7a0e1f7d9=1550835844; Hm_lpvt_c22ac8657ced9bd55e529ce7a0e1f7d9=1550835844; JJFormHashKey=912f18049dae7a13e03df54d7d72ca93; User_Id=733493473; User_Nick=d0c2cad6373333343933343733; FigureId=0; PartnerId=0; UserCookieKey=c937381015d9ce02d9a368faf1047195; u_ltime=1550835920; u_pass=_f200921f20362b33e26d534087c2d174; UserLoginInfo=18502789819%2C1%2C1550835920%2Cfdbf48be798373a6c77749a4283d572b; qDH_jjuid=733493473; qDH_cookietime=2592000; qDH_auth=32c7Ba7mL69GzoDwAp%2BYhulsbpodwzfokm%2FkX76Rlw4X%2Fv0yL7wkzP1XpFRjwUbFyVtNaupGuhztyemXr5sloJanzARf%2FsIW; qDH_sid=r7TFZe; CNZZDATA4054856=cnzz_eid%3D1428032294-1550729010-%26ntime%3D1550835718; qDH_onlineusernum=2801; Hm_lpvt_65699696998080926ad677627de4c418=1550836316"

    # Characters replaced by "_" before a nickname is used as a file name.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'
    _FILENAME_TABLE = str.maketrans(
        _UNSAFE_FILENAME_CHARS, "_" * len(_UNSAFE_FILENAME_CHARS)
    )

    def __init__(self, cookie=None):
        """Prepare the crawler without starting a browser.

        Args:
            cookie: Value for the ``cookie`` request header. Defaults to
                ``DEFAULT_COOKIE``.
        """
        # Spider.__init__ is intentionally not called: it would launch Chrome,
        # which this requests-based crawler never uses.
        self.agents = list(self.AGENTS)
        self.cookie = self.DEFAULT_COOKIE if cookie is None else cookie

    def get_info(self, start_page=2, end_page=10, image_dir="images"):
        """Download the avatars of all posters found on the given forum pages.

        For every forum page in ``range(start_page, end_page)`` each listed
        thread is opened, and for every post in it the nickname and avatar URL
        are printed and the avatar is saved as ``<image_dir>/<nickname>.jpg``.
        Pages, threads, posts or downloads that fail are reported with
        ``print`` and skipped so that one bad item does not end the crawl.

        Args:
            start_page: First forum page to crawl (inclusive).
            end_page: Page number at which to stop (exclusive).
            image_dir: Directory that receives the images; created if missing.
        """
        from urllib.parse import urljoin

        # urlretrieve does not create missing parent directories.
        os.makedirs(image_dir, exist_ok=True)

        for page in range(start_page, end_page):
            list_url = "https://bbs.jj.cn/forumdisplay.php?fid=173&page=%s" % page
            # The same headers (and User-Agent) are reused for all thread
            # requests made from this forum page.
            headers = self._build_headers(page)

            listing = self._fetch_soup(list_url, headers)
            if listing is None:
                continue

            for tbody in listing.select("table#forum_173 > tbody"):
                thread_id = self._thread_id(tbody)
                if thread_id is None:
                    continue
                link = "https://bbs.jj.cn/viewthread.php?tid=" + thread_id

                thread = self._fetch_soup(link, headers)
                if thread is None:
                    continue

                for div in thread.select("div#postlist > div "):
                    nick = div.select_one("a.user_nick2")
                    image = div.select_one("div.avatar > a > img")
                    # Not every child of #postlist has to be a post.
                    if nick is None or image is None or not image.get("src"):
                        continue
                    user_name = nick.text
                    # Resolve relative avatar paths against the thread URL;
                    # absolute URLs pass through unchanged.
                    avatar = urljoin(link, image.get("src"))
                    print(user_name, avatar)
                    self._save_avatar(user_name, avatar, image_dir)

    def _build_headers(self, page):
        """Return the request headers used for forum page ``page``."""
        # NOTE(review): 'path', 'scheme' and 'authority' look like HTTP/2
        # pseudo-headers copied from a browser; kept as in the original.
        return {
            'path': '/forumdisplay.php?fid=173&page=%s' % page,
            'referer': 'https://bbs.jj.cn/',
            'User-agent': random.choice(self.agents),
            'scheme': "https",
            'authority': 'bbs.jj.cn',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # 'br' is not advertised: requests only decodes Brotli when an
            # optional Brotli package is installed, otherwise the body would
            # reach the HTML parser still compressed.
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'upgrade-insecure-requests': '1',
            "cookie": self.cookie,
        }

    def _fetch_soup(self, url, headers):
        """GET ``url`` and return the parsed page, or ``None`` on failure."""
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("skipping %s: %s" % (url, exc))
            return None
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _thread_id(tbody):
        """Return the part after the first "_" of the tbody id, else ``None``."""
        parts = (tbody.get("id") or "").split("_")
        if len(parts) < 2 or not parts[1]:
            return None
        return parts[1]

    def _save_avatar(self, user_name, avatar_url, image_dir):
        """Save one avatar as ``<image_dir>/<user_name>.jpg``.

        Returns:
            ``True`` if the file was written, ``False`` if the download failed.
        """
        safe_name = user_name.translate(self._FILENAME_TABLE)
        target = os.path.join(image_dir, "%s.jpg" % safe_name)
        try:
            urllib.request.urlretrieve(avatar_url, target)
        except (OSError, ValueError) as exc:
            # URLError/HTTPError derive from OSError; ValueError covers URLs
            # with an unknown or missing scheme.
            print("failed to download %s: %s" % (avatar_url, exc))
            return False
        return True


if __name__ == "__main__":
    # Script entry point: crawl the forum with the default settings.
    crawler = UserNameSpider()
    crawler.get_info()


posted @ 2019-02-23 18:45  牛牛码代码  阅读(201)  评论(0)  编辑  收藏  举报