CSDN博客地址

Python:爬取京东网站商品信息,并写入 Excel

from retrying import retry
import requests
from lxml import etree
import time
import os


# Search URL that the crawl starts from.
base_url = "https://search.jd.com/Search?keyword=手机华为&enc=utf-8"

# Drop the spreadsheet left over from a previous run, if there is one.
try:
    os.remove("JD.xlsx")
except FileNotFoundError:
    pass

def func(exception=None):
    """Retry predicate handed to ``retry(retry_on_exception=...)``.

    The ``retrying`` library calls this predicate with the exception raised
    by the wrapped function and combines the result with a boolean, so it
    must accept one argument and return a bool. The previous zero-argument
    version returning a message string failed on both counts.

    Args:
        exception: The exception raised by the wrapped call. It is not
            inspected; every failure is considered retryable.

    Returns:
        Always ``True``, meaning "retry this failure".
    """
    return True

@retry(stop_max_attempt_number=7, retry_on_exception=func)
def send_resp(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent, retrying on failure.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block forever and the retry
            policy on this function would never get a chance to run.

    Returns:
        The ``requests`` response object. The HTTP status is not checked.

    Raises:
        Whatever ``requests.get`` raised on the last of the 7 attempts.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    return requests.get(url, headers=headers, timeout=timeout)

def _first(nodes, default=None):
    """Return the first XPath result in *nodes*, or *default* when empty."""
    return nodes[0] if nodes else default


def JD():
    """Crawl the search result pages and collect the listed products.

    The crawl starts at ``base_url`` and then requests ``base_url`` with
    ``&page=N`` appended, one page per second, until a page contains no
    product entries under ``div#J_goodsList``.

    Returns:
        A 4-tuple ``(titles, prices, shop_names, img_urls)`` of parallel
        lists with one entry per product:

        * ``titles``: list of whitespace-separated tokens taken from the
          first text node of the product name.
        * ``prices``: price text.
        * ``shop_names``: shop name, or ``"厂商配送"`` when none is listed.
        * ``img_urls``: image URL prefixed with ``http:``, or ``""`` when
          the entry has no ``src`` attribute.
    """
    titles = []      # product names
    prices = []      # product prices
    shop_names = []  # shop names
    img_urls = []    # image URLs

    # Keep the cursor local so the module-level ``base_url`` is never
    # rewritten and a second call starts from the first page again.
    url = base_url
    # The unpaginated URL is presumably page 1 (TODO confirm against the
    # site), so continue from page 2 rather than requesting page 1 twice.
    next_page = 2
    while True:
        time.sleep(1)  # throttle: at most one request per second
        print(f"正在爬取链接:{url}")
        resp = send_resp(url)
        html = etree.HTML(resp.content.decode())

        items = html.xpath('//div[@id="J_goodsList"]/ul/li')
        if not items:
            # Either the result container is missing or it is empty; in both
            # cases there is nothing further to read, so stop instead of
            # requesting pages forever.
            break

        for item in items:
            title_text = _first(item.xpath(".//div[@class='p-name p-name-type-2']/a/em/text()"))
            price = _first(item.xpath(".//div[@class='p-price']/strong/i/text()"))
            if title_text is None or price is None:
                # Entry lacks the fields a product row needs; skip it rather
                # than abort the whole crawl with an IndexError.
                continue
            shop_name = _first(item.xpath(".//div[@class='p-shop']/span/a/text()"), "厂商配送")
            img_src = _first(item.xpath(".//div[@class='p-img']/a/img/@src"))

            # NOTE(review): the title is stored as a token list, as before;
            # confirm whether a single string is wanted in the spreadsheet.
            titles.append(title_text.split())
            prices.append(price)
            shop_names.append(shop_name)
            img_urls.append(("http:" + img_src) if img_src else "")

        url = f"{base_url}&page={next_page}"
        next_page += 1
    return titles, prices, shop_names, img_urls

def main():
    """Run the crawl and write the results to ``JD.xlsx`` (sheet "商品")."""
    # The file's import section does not bring in pandas, so ``pd`` would be
    # undefined here; import it locally to keep this function self-contained.
    import pandas as pd

    titles, prices, shop_names, img_urls = JD()
    df = pd.DataFrame({'标题': titles, '商品价格': prices, '商铺名称': shop_names, "图片链接地址": img_urls})
    df.to_excel("JD.xlsx", sheet_name="商品", index=False)


if __name__ == '__main__':
    main()

excel文件如下:
在这里插入图片描述

posted @ 2020-05-22 17:46  Yi_warmth  阅读(671)  评论(0)  编辑  收藏  举报
CSDN博客地址