Python 爬取淘宝商品信息并写入 MySQL

直接上代码:(商品名称、单价、图片链接)

import json
import re

import pymysql
import requests


def getHTMLText(url):
    """Fetch *url* with a GET request and return the response body as text.

    The request carries a hard-coded ``cookie`` header and a generic
    ``user-agent``, and times out after 30 seconds.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or ``""`` when the request fails
        (connection error, timeout, or a 4xx/5xx status).
    """
    # NOTE(review): the cookie below is a captured login session embedded in
    # source; it presumably expires and has to be replaced by hand -- confirm.
    kv = {'cookie':'thw=cn; hng=CN%7Czh-CN%7CCNY%7C156; enc=ooWAQ8HPiBkBlDgWaQ2BoQXFD4cHXejeOP0Nq7xvbCuGN5yubT%2ByBjrb2j417KSrQkoR9YQxMFoqYufejy7Hlw%3D%3D; _m_h5_tk=9cc0be22588c97655e9e0ed031f29703_1589472803622; _m_h5_tk_enc=8fd3fcd9077f0f17bcb2dc4f9d593617; cookie2=1a0da2cc9535ebe4f7bd2787bebb9da1; t=0a472589a79eda4e33e9b072e3446525; _tb_token_=e136e0330e37e; alitrackid=www.taobao.com; _samesite_flag_=true; cna=cAVGFNpuDkUCAXkcRV0prgNa; sgcookie=EGxNSorLw1t5Dg21WTFJw; unb=3361002229; uc3=nk2=qA%2Fo8e0UjX1l%2BUs%3D&lg2=UtASsssmOIJ0bQ%3D%3D&id2=UNN78Eg15kheYA%3D%3D&vt3=F8dBxGZobO%2BfXgtBG40%3D; csg=228d2d4a; lgc=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie17=UNN78Eg15kheYA%3D%3D; dnk=%5Cu5E05%5Cu6C14%5Cu5CF0happy; skt=aa63ca1a4a6e356c; existShop=MTU4OTYzNTEyOQ%3D%3D; uc4=nk4=0%40qjS8tzpCQQHfNZapqmDNrmd4%2F2Dhnw%3D%3D&id4=0%40UgQz06zOiEqwpAtViK7HqZlIKslx; tracknick=%5Cu5E05%5Cu6C14%5Cu5CF0happy; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=y99; _nk_=%5Cu5E05%5Cu6C14%5Cu5CF0happy; cookie1=UNJSu1S2nK7AhsBSrVKq4Nd7T4K1fH40ygcHPrTYWeA%3D; lastalitrackid=login.taobao.com; tfstk=ccVGB7cFtRk12ge_PFG_ovbuN0aGaiXZfSPUT5GPww8ivnNE7sYkLLSdMwmTjSpf.; uc1=cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie21=VT5L2FSpccLuJBreK%2BBd&cookie15=UIHiLt3xD8xYTw%3D%3D&existShop=false&pas=0&cookie14=UoTUM2YYf7HXaw%3D%3D; mt=ci=21_1; v=0; JSESSIONID=EC27C014A7BFB337D51D380F31E05C14; l=eBTRDRMIq3U2_UibBOfwourza77OSIRAguPzaNbMiOCPO_fp5GiGWZb3Hg89C3GVh67HR3J_JUNTBeYBqIv4n5U62j-la_kmn; isg=BA4O1D8B3Mb76GvkyGwHSEkCX-TQj9KJQPZkSjhXepHMm671oB8imbRZ08f3g8qh',
          'user-agent':'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Keep the best-effort contract (empty string on failure) but only for
        # request-level errors, and say why instead of failing silently.
        print("request failed for %s: %s" % (url, exc))
        return ""
    # Decode with the encoding detected from the body rather than the one
    # taken from the response headers.
    r.encoding = r.apparent_encoding
    return r.text


def _findJsonStrings(key, html):
    """Return the decoded value of every ``"key":"..."`` pair found in *html*.

    A value that is not a valid JSON string is returned as ``None`` so that
    positions still line up with the other per-item lists.
    """
    # The value pattern accepts backslash escapes, so an escaped quote inside
    # a title no longer terminates the match early.
    pattern = '"' + re.escape(key) + r'":"((?:[^"\\]|\\.)*)"'
    decoded = []
    for raw in re.findall(pattern, html):
        try:
            decoded.append(json.loads('"' + raw + '"'))
        except ValueError:
            decoded.append(None)
    return decoded


def parsePage(ilt, html):
    """Extract goods from a search-result page and append them to *ilt*.

    The page text is scanned for ``"view_price"``, ``"raw_title"`` and
    ``"pic_url"`` string fields; the n-th occurrence of each is combined into
    one row. Values are decoded with ``json.loads`` instead of ``eval`` so
    that text coming from the remote page is never executed, and values that
    contain a colon are no longer truncated.

    Args:
        ilt: List that receives one
            ``[id, title, price, oldprice, 100, image_url, kind]`` row per
            item; it is mutated in place.
        html: Page source as text. Empty or ``None`` input adds nothing.

    Returns:
        None.
    """
    if not html:
        return
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = _findJsonStrings("raw_title", html)
    pics = _findJsonStrings("pic_url", html)
    # Every row carries id 0.
    # NOTE(review): presumably the table's id column is AUTO_INCREMENT and 0
    # lets MySQL assign the key -- confirm against the schema.
    k = 0
    kind = "甘果类零食"
    # zip stops at the shortest list, so a page with unequal field counts
    # yields the complete leading items instead of raising part-way through.
    for price, title, pic in zip(prices, titles, pics):
        if title is None or pic is None:
            continue  # undecodable field: skip this item only
        # "oldprice" is derived from the price by replacing every digit 1
        # with 4; it is not read from the page.
        oldprice = price.replace('1', '4')
        # The 100 is a fixed fifth column value (presumably stock -- confirm).
        ilt.append([k, title, price, oldprice, 100, "https:" + pic, kind])


def printGoodsList(ilt):
    """Insert the collected rows into the ``food`` table and print a summary.

    Args:
        ilt: List of 7-element rows
            ``[id, title, price, oldprice, count, image_url, kind]``, one per
            item; each row is bound to the seven placeholders of the INSERT.

    Returns:
        None.

    Raises:
        pymysql.MySQLError: If connecting, inserting or committing fails. The
            connection is closed before the error propagates.
    """
    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional connection
    # parameters.
    # NOTE(review): credentials are hard-coded in source.
    db = pymysql.connect(host="localhost", user="root", password="511924",
                         database="summerperiod", charset='utf8')
    try:
        with db.cursor() as cursor:
            insert_sql = "INSERT INTO food values (%s,%s,%s,%s,%s,%s,%s)"
            cursor.executemany(insert_sql, ilt)
        db.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        db.close()

    tplt = "{:4}\t{:8}\t{:16}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称","商品图片"))
    for count, g in enumerate(ilt, 1):
        # Row layout is [id, title, price, oldprice, count, image, kind]; pick
        # the fields that match the header (price, title, image).
        print(tplt.format(count, g[2], g[1], g[5]))


def main():
    """Scrape the search result pages for one keyword and store the items.

    Builds the search URL for the hard-coded keyword, downloads ``depth``
    result pages, parses each into a shared list, then hands the list to
    ``printGoodsList``.
    """
    goods = '甘果类零食'
    depth = 2
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for page in range(depth):
        # The ``s`` query parameter advances by 44 per page (presumably the
        # number of items on one result page -- confirm).
        url = start_url + '&s=' + str(44 * page)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Skip a page that fails, but report it; unlike a bare ``except``
            # this lets Ctrl-C and SystemExit stop the run.
            print("skipping page %d: %s" % (page, exc))
            continue
    printGoodsList(infoList)


# Script entry point: starts the scrape as soon as this file is run. There is
# no ``if __name__ == "__main__"`` guard, so importing the file triggers it too.
main()

 

 更改后缀和链接就可以爬取你想要的商品。(这里是以食品为例)

posted @ 2021-01-19 12:25  喜欢爬的孩子  阅读(463)  评论(0)  编辑  收藏  举报