某写真网站爬虫
写了一个很粗糙的某写真网站小爬虫,有空再改进。
import re

import requests
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Gallery page whose images are downloaded.
url = 'http://www.tujidao.com/a/?id=25309'

# PhantomJS flags: do not load images, do not use the disk cache.
# Only referenced by the commented-out PhantomJS alternative below.
PhantomJS_conf = ['--load-images=false', '--disk-cache=false']

# NOTE(review): int() evaluates to 0, so the value 0 is what gets passed to
# send_keys() by default. These look like placeholders for the real account
# name and password -- replace them (or pass arguments to login()) before use.
LOGIN_USERNAME = int()
LOGIN_PASSWORD = int()

# Seconds to wait for a single image download before giving up on it.
DOWNLOAD_TIMEOUT = 30

options = webdriver.FirefoxOptions()
options.add_argument('-headless')
# `options=` replaces the deprecated `firefox_options=` keyword, which
# Selenium 4 no longer accepts.
browser = webdriver.Firefox(options=options)
# browser = webdriver.PhantomJS(service_args=PhantomJS_conf)
# browser.set_window_size(1400, 900)  # set the browser window size
wait = WebDriverWait(browser, 10)


def login(username=LOGIN_USERNAME, password=LOGIN_PASSWORD):
    """Log in through the page's form and return the gallery page source.

    Opens ``url``, fills the username and password inputs, clicks the login
    button, then loads ``url`` again and returns the browser's page source.
    Presumably the first request shows the login form and the second one
    shows the gallery once the session is authenticated -- confirm against
    the site.

    Args:
        username: Value typed into the username input.
        password: Value typed into the password input.

    Returns:
        The page source of ``url`` after the login form was submitted.

    Raises:
        TimeoutException: If any of the three form elements is not present
            within the 10 second limit of the shared ``wait`` object.
    """
    browser.get(url)
    # Username input.
    user_input = wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR,
        'div.layui-form-item:nth-child(1) > div:nth-child(2) > input:nth-child(1)',
    )))
    # Password input.
    pass_input = wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR,
        'div.layui-form-item:nth-child(2) > div:nth-child(2) > input:nth-child(1)',
    )))
    # Login button.
    login_button = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.layui-btn')))
    user_input.send_keys(username)
    pass_input.send_keys(password)
    login_button.click()
    browser.get(url)
    return browser.page_source


def get_image():
    """Return the ``<img>`` elements found inside the ``#kbox`` element.

    Logs in via :func:`login`, parses the returned page source with pyquery
    and selects ``#kbox img``.

    Returns:
        A PyQuery selection of the matched ``<img>`` elements.
    """
    # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.footer')))
    page_source = login()
    doc = pq(page_source)
    return doc('#kbox img')  # images located through the #kbox id


def register():
    """Return the ``data-src`` URL of every image found by :func:`get_image`.

    The attribute is read directly from each element rather than by running
    a regular expression over the re-serialized markup: the regex only
    matched when ``data-src`` was the last attribute of the tag, and it
    returned entity-escaped values (``&amp;`` instead of ``&``).

    Returns:
        A list of URL strings; images without a ``data-src`` value are
        left out.
    """
    sources = (img.attr('data-src') for img in get_image().items())
    return [src for src in sources if src]


# Files are written as <prefix>0.jpg, <prefix>1.jpg, ... where the number is
# the image's position in the list returned by register().
save_prefix = r'C:\Users\admin\Desktop\test\a'
try:
    for count, image_url in enumerate(register()):
        try:
            response = requests.get(image_url, timeout=DOWNLOAD_TIMEOUT)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as exc:
            # One bad image should not abort the remaining downloads.
            print('skipped {}: {}'.format(image_url, exc))
            continue
        with open(save_prefix + '{}'.format(count) + '.jpg', mode='wb') as f:
            f.write(response.content)
finally:
    # Always shut down the headless Firefox process, even after an error.
    browser.quit()