# Scrape product information from JD.com search results.
import re,time,requests,bs4,csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
def _find_in_div(item, css_class, *tags):
    """Return the tag reached from ``div.<css_class>`` inside ``item`` by
    following the first child named by each entry of ``tags`` in turn, or
    ``None`` as soon as any step of that path is missing."""
    node = item.find(name='div', attrs={"class": css_class})
    for tag in tags:
        if node is None:
            return None
        node = node.find(tag)
    return node


def get_infotext(ulist, html):
    """Parse a search-result page and append one row per product to ``ulist``.

    Each appended row is ``[name, price, img, commit, shopnum]``. A field
    whose markup is absent from a product entry is stored as ``None``
    instead of raising ``AttributeError`` (this was the cause of the
    sporadic crash on the ``p-shopnum`` lookup: not every entry has that
    link). ``<li>`` elements that contain none of the fields (for example
    list items nested inside a product entry) are skipped.

    Args:
        ulist: List the rows are appended to; it is mutated in place.
        html: HTML source of the search-result page.

    Returns:
        None. If the page has no ``div#J_goodsList`` container, ``ulist``
        is left unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    goods = soup.find(name='div', attrs={"id": "J_goodsList"})
    if goods is None:
        return

    for li in goods.find_all('li'):
        img = None
        img_tag = _find_in_div(li, 'p-img', 'a', 'img')
        if img_tag is not None:
            img = img_tag.get('src')
            if img is None:
                # Images that have not been scrolled into view keep their
                # URL in the lazy-load attribute instead of ``src``.
                img = img_tag.get('data-lazy-img')

        price_tag = _find_in_div(li, 'p-price', 'i')
        name_tag = _find_in_div(li, 'p-name', 'a', 'em')
        commit_tag = _find_in_div(li, 'p-commit', 'a')
        shop_tag = _find_in_div(li, 'p-shopnum', 'a')

        price = price_tag.string if price_tag is not None else None
        name = name_tag.text if name_tag is not None else None
        commit = commit_tag.string if commit_tag is not None else None
        shopnum = shop_tag.text if shop_tag is not None else None

        row = [name, price, img, commit, shopnum]
        if all(field is None for field in row):
            # Not a product entry at all; do not emit an empty row.
            continue
        ulist.append(row)
def print_infotext(ulist, num):
    """Print the first ``num`` rows of ``ulist``, one row per line.

    Each row is printed as its first five fields separated by spaces.

    Args:
        ulist: Sequence of rows; each row must have at least five fields.
        num: Maximum number of rows to print. If it exceeds ``len(ulist)``
            only the available rows are printed (previously this raised
            ``IndexError``); zero or a negative value prints nothing.
    """
    # Clamp at zero so a negative count prints nothing (as ``range`` did)
    # rather than being read as a "from the end" slice bound.
    for row in ulist[:max(num, 0)]:
        print(row[0], row[1], row[2], row[3], row[4])
def print_infocsv(ulist, path=r'D:\pics\jindong.csv'):
    """Write the scraped rows to a CSV file, one CSV record per row.

    Args:
        ulist: Iterable of rows (each a sequence of field values).
        path: Destination file; it is overwritten if it exists. The default
            is the location the script has always written to.

    Raises:
        OSError: If the file cannot be opened for writing (for example the
            target directory does not exist).
    """
    # Encode explicitly: the rows hold scraped page text, and relying on the
    # platform's default codec can raise UnicodeEncodeError for characters it
    # cannot represent. The BOM written by "utf-8-sig" lets spreadsheet
    # programs recognise the file as UTF-8.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(ulist)
# Script body: open the search page in a browser, scrape the first few
# result pages and write everything that was collected to the CSV file.
url = "https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&page=1&s=1&click=0"
# Full path of the browser executable that the driver should launch.
exe_data = r'C:\Users\lsk17\AppData\Local\360Chrome\Chrome\Application\360chrome.exe'
chrome_options = Options()
chrome_options.binary_location = exe_data
# ``options=`` is the supported keyword; the old ``chrome_options=`` spelling
# is deprecated and rejected by current Selenium releases.
browser = webdriver.Chrome(options=chrome_options)

page_count = 3  # number of result pages to scrape
num = 2         # page number typed into the pager to reach the next page
uinfo = []
try:
    browser.get(url)
    for page_index in range(page_count):
        # Scroll down in growing steps (1000, 2000, 4000, 8000 px), pausing
        # after each one, before the page source is read.
        length = 1000
        for _ in range(4):
            js = "var q=document.documentElement.scrollTop=" + str(length)
            browser.execute_script(js)
            time.sleep(1)
            length += length
        time.sleep(1)
        get_infotext(uinfo, browser.page_source)

        if page_index == page_count - 1:
            # Last page already scraped; no need to load another one.
            break

        # Jump to the next page through the "go to page" box of the pager.
        div = browser.find_element(By.ID, "J_bottomPage")
        elem = div.find_element(By.CLASS_NAME, "input-txt")
        elem.clear()
        elem.send_keys(str(num))
        elem.send_keys(Keys.RETURN)
        num += 1
finally:
    # Always shut the browser and driver down, even if scraping failed.
    browser.quit()

print_infocsv(uinfo)