欢迎来到武韵的博客

爬取京东商品信息

import re,time,requests,bs4,csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

def _descend(tag, *names):
    """Follow a chain of attribute lookups, returning None once a link is missing."""
    for name in names:
        if tag is None:
            return None
        tag = getattr(tag, name)
    return tag


def _product_div(li, css_class):
    """Return the first <div> with the given class inside *li*, or None."""
    return li.find(name='div', attrs={"class": css_class})


def get_infotext(ulist, html):
    """Extract product rows from a search-result page and append them to *ulist*.

    Args:
        ulist: List that is extended in place with one
            ``[name, price, img, commit, shopnum]`` row per product.
        html: HTML source of the page to parse.

    Every <li> under the ``J_goodsList`` container is inspected. A field whose
    markup is absent is stored as None instead of raising AttributeError, and
    an <li> that yields no field at all is skipped. If the container itself
    is missing, nothing is appended.
    """
    soup = BeautifulSoup(html, "html.parser")
    goods = soup.find(name='div', attrs={"id": "J_goodsList"})
    if goods is None:
        return

    for li in goods.find_all('li'):
        img_tag = _descend(_product_div(li, 'p-img'), 'a', 'img')
        img = None
        if img_tag is not None:
            img = img_tag.get('src')
            if img is None:
                # Fall back to the alternate attribute when 'src' is not set.
                img = img_tag.get('data-lazy-img')

        price = _descend(_product_div(li, 'p-price'), 'i', 'string')
        name = _descend(_product_div(li, 'p-name'), 'a', 'em', 'text')
        commit = _descend(_product_div(li, 'p-commit'), 'a', 'string')
        # Not every entry carries this block; a missing one yields None.
        shopnum = _descend(_product_div(li, 'p-shopnum'), 'a', 'text')

        row = [name, price, img, commit, shopnum]
        if all(field is None for field in row):
            # Not a product entry (e.g. an <li> nested inside another item).
            continue
        ulist.append(row)
def print_infotext(ulist,num):
    """Print the first *num* rows of *ulist*, one row per line.

    Args:
        ulist: Sequence of rows; the first five items of each row are printed
            separated by spaces.
        num: Maximum number of rows to print. If it exceeds ``len(ulist)``
            only the available rows are printed; a value <= 0 prints nothing.
    """
    # Slice instead of indexing by range(num) so a short list cannot raise
    # IndexError; max() keeps a negative num from slicing from the end.
    for u in ulist[:max(num, 0)]:
        print(u[0],u[1],u[2],u[3],u[4])

def print_infocsv(ulist, path=r'D:\pics\jindong.csv'):
    """Write the rows of *ulist* to a CSV file, replacing any existing file.

    Args:
        ulist: Iterable of rows; each row is written as one CSV record.
        path: Destination file. Defaults to the location the script has
            always written to, so existing callers are unaffected.

    The file is written as UTF-8 with a byte-order mark so that non-ASCII
    text is encoded the same way on every platform instead of depending on
    the locale's default codec.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(ulist)
# Search-result URL the crawl starts from (first result page).
url = "https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page=1&s=1&click=0"
exe_data = r'C:\Users\lsk17\AppData\Local\360Chrome\Chrome\Application\360chrome.exe'  # path to the browser executable
chrome_options = Options()
chrome_options.binary_location = exe_data

# `options=` is the supported keyword; `chrome_options=` is deprecated and
# no longer accepted by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)
num = 2  # page number typed into the pager to move to the next page
uinfo = []
try:
    browser.get(url)
    for _page in range(3):
        # Scroll down in growing steps (the offset doubles each time) with
        # pauses in between, before the page source is read.
        length = 1000
        for _step in range(4):
            js = "var q=document.documentElement.scrollTop=" + str(length)
            browser.execute_script(js)
            time.sleep(1)
            length += length
            time.sleep(1)
        get_infotext(uinfo, browser.page_source)

        # Jump to the next page through the "go to page" input at the bottom.
        div = browser.find_element(By.ID, "J_bottomPage")
        elem = div.find_element(By.CLASS_NAME, "input-txt")
        elem.clear()
        elem.send_keys(str(num))
        elem.send_keys(Keys.RETURN)
        num += 1
finally:
    # Always shut the browser down, even if scraping fails part-way.
    browser.quit()
print_infocsv(uinfo)

 

posted on 2019-11-26 16:36  武韵  阅读(111)  评论(0)  编辑  收藏  举报

导航