数据采集第一次作业
作业报告
作业一:大学排名数据爬取
作业代码和运行结果
作业主要代码:
import urllib.request #urllib.request:用于从互联网获取网页数据。urlopen 方法可以打开一个 URL 并读取网页内容。
from bs4 import BeautifulSoup #BeautifulSoup:这是一个用于解析 HTML 和 XML 文档的 Python 库,html.parser 是内置的 HTML 解析器。BeautifulSoup 提供了一种简单的方法来遍历和提取网页中的数据。
# Best Chinese Universities Ranking page (2020 edition) that this script scrapes.
url = 'http://www.shanghairanking.cn/rankings/bcur/2020'


def fetch_html(page_url, timeout=10):
    """Download *page_url* and return its body decoded as UTF-8.

    The response is used as a context manager so the connection is always
    closed, and *timeout* (seconds) keeps a stalled server from hanging the
    script indefinitely.
    """
    with urllib.request.urlopen(page_url, timeout=timeout) as response:
        return response.read().decode('utf-8')


def extract_rows(html):
    """Return the first five cell texts of every data row of the ranking table.

    The table is located by its ``rk-table`` class. The first ``<tr>`` is
    treated as the header row and skipped; rows with fewer than five ``<td>``
    cells are ignored instead of raising ``IndexError``.

    Raises:
        ValueError: if the page contains no ``<table class="rk-table">``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'class': 'rk-table'})
    if table is None:
        raise ValueError("ranking table with class 'rk-table' not found")
    records = []
    for row in table.find_all('tr')[1:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) < 5:
            continue
        records.append(cells[:5])
    return records


def format_row(cells):
    """Format rank, school name, province, school type and score as one tab-separated line."""
    rank, school_name, province, school_type, score = cells[:5]
    return f"{rank}\t{school_name}\t{province}\t{school_type}\t{score}"


def main():
    """Print the table header followed by one line per ranked university."""
    print("排名\t学校名称\t省市\t学校类型\t总分")
    for cells in extract_rows(fetch_html(url)):
        print(format_row(cells))


if __name__ == "__main__":
    main()
运行结果:
作业心得:通过这次作业,我学会了使用 urllib.request 模块访问网页,并使用 BeautifulSoup 库解析 HTML 文档。
作业二:商城排重
作业代码和运行结果
作业主要代码:
import requests
from bs4 import BeautifulSoup
import json
import pprint
import csv
import time
import random
import re
# Columns saved for each product: its title and its price.
fieldnames = ['标题', '价格']
# Name of the output file.
filename = 'taobao.csv'
# 'utf-8-sig' replaces the original 'ANSI': that name is only a Windows alias
# of the 'mbcs' codec and raises LookupError on other platforms. The BOM
# written by 'utf-8-sig' lets Excel detect the encoding of the Chinese text.
with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
    csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Write the header row.
    csvwriter.writeheader()
    # Crawl the Taobao search results for "书包" (schoolbag) page by page; the
    # request and parsing statements that follow form the body of this loop.
    # NOTE(review): the request URL is a fixed literal that does not use
    # `page`, so widening this range would fetch the same page again.
    for page in range(1,2):
        print("="*10+f'正在爬取第{page}页'+'='*10)
url ="https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?jsv=2.7.2&appKey=12574478&t=1727354883352&sign=a794163d796bfb61b5f5b0eec984e7a6&api=mtop.relationrecommend.wirelessrecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp8&data=%7B%22appId%22%3A%2234385%22%2C%22params%22%3A%22%7B%5C%22device%5C%22%3A%5C%22HMA-AL00%5C%22%2C%5C%22isBeta%5C%22%3A%5C%22false%5C%22%2C%5C%22grayHair%5C%22%3A%5C%22false%5C%22%2C%5C%22from%5C%22%3A%5C%22nt_history%5C%22%2C%5C%22brand%5C%22%3A%5C%22HUAWEI%5C%22%2C%5C%22info%5C%22%3A%5C%22wifi%5C%22%2C%5C%22index%5C%22%3A%5C%224%5C%22%2C%5C%22rainbow%5C%22%3A%5C%22%5C%22%2C%5C%22schemaType%5C%22%3A%5C%22auction%5C%22%2C%5C%22elderHome%5C%22%3A%5C%22false%5C%22%2C%5C%22isEnterSrpSearch%5C%22%3A%5C%22true%5C%22%2C%5C%22newSearch%5C%22%3A%5C%22false%5C%22%2C%5C%22network%5C%22%3A%5C%22wifi%5C%22%2C%5C%22subtype%5C%22%3A%5C%22%5C%22%2C%5C%22hasPreposeFilter%5C%22%3A%5C%22false%5C%22%2C%5C%22prepositionVersion%5C%22%3A%5C%22v2%5C%22%2C%5C%22client_os%5C%22%3A%5C%22Android%5C%22%2C%5C%22gpsEnabled%5C%22%3A%5C%22false%5C%22%2C%5C%22searchDoorFrom%5C%22%3A%5C%22srp%5C%22%2C%5C%22debug_rerankNewOpenCard%5C%22%3A%5C%22false%5C%22%2C%5C%22homePageVersion%5C%22%3A%5C%22v7%5C%22%2C%5C%22searchElderHomeOpen%5C%22%3A%5C%22false%5C%22%2C%5C%22search_action%5C%22%3A%5C%22initiative%5C%22%2C%5C%22sugg%5C%22%3A%5C%22_4_1%5C%22%2C%5C%22sversion%5C%22%3A%5C%2213.6%5C%22%2C%5C%22style%5C%22%3A%5C%22list%5C%22%2C%5C%22ttid%5C%22%3A%5C%22600000%40taobao_pc_10.7.0%5C%22%2C%5C%22needTabs%5C%22%3A%5C%22true%5C%22%2C%5C%22areaCode%5C%22%3A%5C%22CN%5C%22%2C%5C%22vm%5C%22%3A%5C%22nw%5C%22%2C%5C%22countryNum%5C%22%3A%5C%22156%5C%22%2C%5C%22m%5C%22%3A%5C%22pc%5C%22%2C%5C%22page%5C%22%3A1%2C%5C%22n%5C%22%3A48%2C%5C%22q%5C%22%3A%5C%22%25E4%25B9%25A6%25E5%258C%2585%5C%22%2C%5C%22qSource%5C%22%3A%5C%22url%5C%22%2C%5C%22pageSource%5C%22%3A%5C%22a21bo.jianhua%2Fa.201856.d13%5C%22%2C%5C%22tab%5C%22%3A%5C%22all%5C%22%2C%5C%22p
ageSize%5C%22%3A48%2C%5C%22totalPage%5C%22%3A100%2C%5C%22totalResults%5C%22%3A4800%2C%5C%22sourceS%5C%22%3A%5C%220%5C%22%2C%5C%22sort%5C%22%3A%5C%22_coefp%5C%22%2C%5C%22bcoffset%5C%22%3A%5C%22%5C%22%2C%5C%22ntoffset%5C%22%3A%5C%22%5C%22%2C%5C%22filterTag%5C%22%3A%5C%22%5C%22%2C%5C%22service%5C%22%3A%5C%22%5C%22%2C%5C%22prop%5C%22%3A%5C%22%5C%22%2C%5C%22loc%5C%22%3A%5C%22%5C%22%2C%5C%22start_price%5C%22%3Anull%2C%5C%22end_price%5C%22%3Anull%2C%5C%22startPrice%5C%22%3Anull%2C%5C%22endPrice%5C%22%3Anull%2C%5C%22itemIds%5C%22%3Anull%2C%5C%22p4pIds%5C%22%3Anull%2C%5C%22categoryp%5C%22%3A%5C%22%5C%22%2C%5C%22myCNA%5C%22%3A%5C%22SVV4H1whwCwCAdRrH1X%2B4IOE%5C%22%7D%22%7D"
# Pause for a random 1-4 seconds before issuing the request.
time.sleep(random.randint(1,4))
# Send the request
# Request headers that imitate a desktop Chrome browser.
# The header names originally contained spaces around the hyphens
# ('User - Agent', 'Sec - Ch - Ua', ...), which are not the names a server
# recognises, so no real User-Agent was being sent; they are spelled correctly
# here. The Cookie value, which had been split across two lines, is one string.
headers = {
    # NOTE(review): this is a logged-in session cookie hard-coded in the
    # source; prefer loading it from an environment variable or a local file.
    'Cookie':'cookie2=1d2daa77cdb3a278ed86071156c4906a; t=1f4797e0007c8ee787c96052a977c5b5; _tb_token_=e3ea388e3ef67; thw=xx; xlly_s=1; wk_cookie2=1dd6127e93d2b80d0107d2ff2b1ce0e5; wk_unb=UUphzWEfsgPtdg3HYQ%3D%3D; mt=ci=0_0; 3PcFlag=1727100957926; cna=SVV4H1whwCwCAdRrH1X+4IOE; unb=2207695484430; lgc=tb790367109; cancelledSubSites=empty; cookie17=UUphzWEfsgPtdg3HYQ%3D%3D; dnk=tb790367109; tracknick=tb790367109; _cc_=VT5L2FSpdA%3D%3D; _l_g_=Ug%3D%3D; sg=902; _nk_=tb790367109; cookie1=AiA2dRGFIGtavVin69Bkr1A72fg%2FOHlC3RTUbcnWMaw%3D; sgcookie=E100YL9vT%2F8dHbjlwEzSlXcuKOg6wn6Btp39dxpZnpYNqnkFiwzhCn9vcvguyI2TUmf4G6c9azyLn3IAFFteuTiL0OYp0QC5RWgNU5ctApfJyKM%3D; havana_lgc2_0=eyJoaWQiOjIyMDc2OTU0ODQ0MzAsInNnIjoiODc1OWJkMzk1MmY1MjAzZTQyNjYxODE4ODM2ZTg3MzQiLCJzaXRlIjowLCJ0b2tlbiI6IjFKQ2d6QllWRTlkWWNGZEhaSlB2OWlnIn0; _hvn_lgc_=0; havana_lgc_exp=1758208473956; cookie3_bak=1d2daa77cdb3a278ed86071156c4906a; cookie3_bak_exp=1727363673956; sn=; uc3=id2=UUphzWEfsgPtdg3HYQ%3D%3D&vt3=F8dD3iVFpmRZBeDoRRo%3D&nk2=F5RCbIxsAmuxIZ4%3D&lg2=Vq8l%2BKCLz3%2F65A%3D%3D; csg=50c85c21; env_bak=FM%2Bgnk3pmrwOstCMsTTEEg5cPUVicMn5jVAKSUDap0LJ; skt=d6adb5262d571065; existShop=MTcyNzEwNDQ3Mw%3D%3D; uc4=nk4=0%40FY4JgmrYSfWwhJOl0WBvpr2qymx6fA%3D%3D&id4=0%40U2grFn5KLJReLldo4nxRaeZ%2BTobZcl8z; miid=5027031754879055512; tk_trace=oTRxOWSBNwn9evfHtXQifmafmYzSU80EW6dedixskcJ%2Bc8J0R27pVpwR009bxUw3pJYMDEOjj9VTZncJdZFFzzCuQZlxUBt4udnYpW6a4ZIJ8ynPKaDn3alD8qVSGYroPSL6BCPTLpH0tVh78VN0PsOpiQ7lNMKjExKPCeIcunssNgUIB7IL4lu2cdst%2FGPNqNYQI0EY2fJDWUfcYEzXLfIXQABN6serFV65yZFUMi7OFuU8p8YoEExAHm%2F%2FuX6i8UdUbe6FtKQNmeluKmFKiQAhd0UAm7xQVQ2KHFcWxM8heNshsHlOZ6cm1avG2%2BJTkBlZc2jyHTaoC9Panpo8mUDoXT9ZXRglrzPCo1r2Ug7zkVvqUC1QhgYc%2BK7LN2CmxuAzv7Yinq1dHUR6IBa67crKob0F7qT46l0iAo38WFgm2Gtg; tkSid=1727350495195_189934807_0.0; mtop_partitioned_detect=1; _m_h5_tk=7a355ff414abbfc210bf911064449df7_1727360937259; _m_h5_tk_enc=ee5b7f970cf3eff8b92728d65567fcb3; _samesite_flag_=true; havana_sdkSilent=1727436899530; sdkSilent=1727437119252; tfstk=gGoEMq2TdHKE2qxbCREz3z4dFrEL4kAXY0N7EYDudWV3OY2u4fcWpWMk9bozsY3BpzOdU9nZggsB9eHl4uZkGItXcveQ2uAbseGfjWe-I7AfThZpKuEkGC_1qyK42j_1JuulSRVTU82hqbqgI-2YqJ4utdPgU-QltbqkQdyuF623r8bgI-2zq7cuqdkiC2GKwsw0-KgjIdMMiBa7ivVNqiWYLyqvD5SlwcyEQSDhogjobJzEVHR3e-aIrxUxAYtPXkMqSl2r2KbzaruoAlmHsn4Tr40bzV1HdjcrIbEIjLb3i2GQjD0VEgDaY5z7LyCe_SmKImEg5nvqIchIpck5E32sG5DKxu-DHkr3tlytVCI3aV0oAyERtQP-bAmnzgzOwRjSw03FZaz3BRPXQd8vULWS5NGzRaQ8Sr2aGpUlyaU3BRPXQd7RyP40QS9Lr; isg=BDw8QZrvdjpH20JiofEsBd5EDdruNeBfJodd7ha9SCcL4dxrPkWw77JTwQmZqRi3',
    'Referer': 'https://s.taobao.com/search?_input_charset=utf-8&commend=all&ie=utf8&initiative_id=tbindexz_20170306&localImgKey=&page=1&q=%E4%B9%A6%E5%8C%85&search_type=item&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.201856.d13&ssid=s5-e&suggest_query=&tab=all&wq=',
    'Sec-Ch-Ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'script',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
}
# Fetch the result page; the timeout keeps a stalled connection from hanging.
res = requests.get(url=url, headers=headers, timeout=10)
# Extract the sale prices from the response text.
price = r'"SALEPRICE":"(\d+\.\d+)"'
pri_matches = re.findall(price, res.text)
# Extract the product titles from the response text.
tit = r'"ADGTITLE":"(.*?)(?="|\\n)'
tit_matches = re.findall(tit, res.text)
# Keep at most the first 21 title/price pairs, as before, but stop early when
# fewer were matched instead of raising IndexError.
for title, sale_price in zip(tit_matches[:21], pri_matches[:21]):
    print("TITLE:"+title,"PRICE:"+sale_price)
    # Previously the row dict was built (shadowing the builtin `dict`) but
    # never written, so the CSV only ever contained its header line.
    csvwriter.writerow({'标题': title, '价格': sale_price})
运行结果:
作业心得:学会了利用 Cookie、User-Agent 等字段构建请求头,模拟浏览器行为,以免被封禁。
作业三
作业代码和运行结果
作业代码:
import requests
import re
import time
import random
#保存多页面的url
urls=["https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?jsv=2.7.2&appKey=12574478&t=1728818677222&sign=238a6a2a86e56cc7c4775fd6fed11aa5&api=mtop.relationrecommend.wirelessrecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp8&data=%7B%22appId%22%3A%2234385%22%2C%22params%22%3A%22%7B%5C%22device%5C%22%3A%5C%22HMA-AL00%5C%22%2C%5C%22isBeta%5C%22%3A%5C%22false%5C%22%2C%5C%22grayHair%5C%22%3A%5C%22false%5C%22%2C%5C%22from%5C%22%3A%5C%22nt_history%5C%22%2C%5C%22brand%5C%22%3A%5C%22HUAWEI%5C%22%2C%5C%22info%5C%22%3A%5C%22wifi%5C%22%2C%5C%22index%5C%22%3A%5C%224%5C%22%2C%5C%22rainbow%5C%22%3A%5C%22%5C%22%2C%5C%22schemaType%5C%22%3A%5C%22auction%5C%22%2C%5C%22elderHome%5C%22%3A%5C%22false%5C%22%2C%5C%22isEnterSrpSearch%5C%22%3A%5C%22true%5C%22%2C%5C%22newSearch%5C%22%3A%5C%22false%5C%22%2C%5C%22network%5C%22%3A%5C%22wifi%5C%22%2C%5C%22subtype%5C%22%3A%5C%22%5C%22%2C%5C%22hasPreposeFilter%5C%22%3A%5C%22false%5C%22%2C%5C%22prepositionVersion%5C%22%3A%5C%22v2%5C%22%2C%5C%22client_os%5C%22%3A%5C%22Android%5C%22%2C%5C%22gpsEnabled%5C%22%3A%5C%22false%5C%22%2C%5C%22searchDoorFrom%5C%22%3A%5C%22srp%5C%22%2C%5C%22debug_rerankNewOpenCard%5C%22%3A%5C%22false%5C%22%2C%5C%22homePageVersion%5C%22%3A%5C%22v7%5C%22%2C%5C%22searchElderHomeOpen%5C%22%3A%5C%22false%5C%22%2C%5C%22search_action%5C%22%3A%5C%22initiative%5C%22%2C%5C%22sugg%5C%22%3A%5C%22_4_1%5C%22%2C%5C%22sversion%5C%22%3A%5C%2213.6%5C%22%2C%5C%22style%5C%22%3A%5C%22list%5C%22%2C%5C%22ttid%5C%22%3A%5C%22600000%40taobao_pc_10.7.0%5C%22%2C%5C%22needTabs%5C%22%3A%5C%22true%5C%22%2C%5C%22areaCode%5C%22%3A%5C%22CN%5C%22%2C%5C%22vm%5C%22%3A%5C%22nw%5C%22%2C%5C%22countryNum%5C%22%3A%5C%22156%5C%22%2C%5C%22m%5C%22%3A%5C%22pc%5C%22%2C%5C%22page%5C%22%3A1%2C%5C%22n%5C%22%3A48%2C%5C%22q%5C%22%3A%5C%22%25E4%25B9%25A6%25E5%258C%2585%5C%22%2C%5C%22qSource%5C%22%3A%5C%22url%5C%22%2C%5C%22pageSource%5C%22%3A%5C%22a21bo.jianhua%2Fa.201856.d13%5C%22%2C%5C%22tab%5C%22%3A%5C%22all%5C%22%2C%5C%22
pageSize%5C%22%3A48%2C%5C%22totalPage%5C%22%3A100%2C%5C%22totalResults%5C%22%3A4800%2C%5C%22sourceS%5C%22%3A%5C%220%5C%22%2C%5C%22sort%5C%22%3A%5C%22_coefp%5C%22%2C%5C%22bcoffset%5C%22%3A%5C%22%5C%22%2C%5C%22ntoffset%5C%22%3A%5C%22%5C%22%2C%5C%22filterTag%5C%22%3A%5C%22%5C%22%2C%5C%22service%5C%22%3A%5C%22%5C%22%2C%5C%22prop%5C%22%3A%5C%22%5C%22%2C%5C%22loc%5C%22%3A%5C%22%5C%22%2C%5C%22start_price%5C%22%3Anull%2C%5C%22end_price%5C%22%3Anull%2C%5C%22startPrice%5C%22%3Anull%2C%5C%22endPrice%5C%22%3Anull%2C%5C%22itemIds%5C%22%3Anull%2C%5C%22p4pIds%5C%22%3Anull%2C%5C%22p4pS%5C%22%3Anull%2C%5C%22categoryp%5C%22%3A%5C%22%5C%22%2C%5C%22myCNA%5C%22%3A%5C%22SVV4H1whwCwCAdRrH1X%2B4IOE%5C%22%7D%22%7D",
"https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/?jsv=2.7.2&appKey=12574478&t=1728818677222&sign=238a6a2a86e56cc7c4775fd6fed11aa5&api=mtop.relationrecommend.wirelessrecommend.recommend&v=2.0&type=jsonp&dataType=jsonp&callback=mtopjsonp8&data=%7B%22appId%22%3A%2234385%22%2C%22params%22%3A%22%7B%5C%22device%5C%22%3A%5C%22HMA-AL00%5C%22%2C%5C%22isBeta%5C%22%3A%5C%22false%5C%22%2C%5C%22grayHair%5C%22%3A%5C%22false%5C%22%2C%5C%22from%5C%22%3A%5C%22nt_history%5C%22%2C%5C%22brand%5C%22%3A%5C%22HUAWEI%5C%22%2C%5C%22info%5C%22%3A%5C%22wifi%5C%22%2C%5C%22index%5C%22%3A%5C%224%5C%22%2C%5C%22rainbow%5C%22%3A%5C%22%5C%22%2C%5C%22schemaType%5C%22%3A%5C%22auction%5C%22%2C%5C%22elderHome%5C%22%3A%5C%22false%5C%22%2C%5C%22isEnterSrpSearch%5C%22%3A%5C%22true%5C%22%2C%5C%22newSearch%5C%22%3A%5C%22false%5C%22%2C%5C%22network%5C%22%3A%5C%22wifi%5C%22%2C%5C%22subtype%5C%22%3A%5C%22%5C%22%2C%5C%22hasPreposeFilter%5C%22%3A%5C%22false%5C%22%2C%5C%22prepositionVersion%5C%22%3A%5C%22v2%5C%22%2C%5C%22client_os%5C%22%3A%5C%22Android%5C%22%2C%5C%22gpsEnabled%5C%22%3A%5C%22false%5C%22%2C%5C%22searchDoorFrom%5C%22%3A%5C%22srp%5C%22%2C%5C%22debug_rerankNewOpenCard%5C%22%3A%5C%22false%5C%22%2C%5C%22homePageVersion%5C%22%3A%5C%22v7%5C%22%2C%5C%22searchElderHomeOpen%5C%22%3A%5C%22false%5C%22%2C%5C%22search_action%5C%22%3A%5C%22initiative%5C%22%2C%5C%22sugg%5C%22%3A%5C%22_4_1%5C%22%2C%5C%22sversion%5C%22%3A%5C%2213.6%5C%22%2C%5C%22style%5C%22%3A%5C%22list%5C%22%2C%5C%22ttid%5C%22%3A%5C%22600000%40taobao_pc_10.7.0%5C%22%2C%5C%22needTabs%5C%22%3A%5C%22true%5C%22%2C%5C%22areaCode%5C%22%3A%5C%22CN%5C%22%2C%5C%22vm%5C%22%3A%5C%22nw%5C%22%2C%5C%22countryNum%5C%22%3A%5C%22156%5C%22%2C%5C%22m%5C%22%3A%5C%22pc%5C%22%2C%5C%22page%5C%22%3A1%2C%5C%22n%5C%22%3A48%2C%5C%22q%5C%22%3A%5C%22%25E4%25B9%25A6%25E5%258C%2585%5C%22%2C%5C%22qSource%5C%22%3A%5C%22url%5C%22%2C%5C%22pageSource%5C%22%3A%5C%22a21bo.jianhua%2Fa.201856.d13%5C%22%2C%5C%22tab%5C%22%3A%5C%22all%5C%22%2C%5C%22pageSi
ze%5C%22%3A48%2C%5C%22totalPage%5C%22%3A100%2C%5C%22totalResults%5C%22%3A4800%2C%5C%22sourceS%5C%22%3A%5C%220%5C%22%2C%5C%22sort%5C%22%3A%5C%22_coefp%5C%22%2C%5C%22bcoffset%5C%22%3A%5C%22%5C%22%2C%5C%22ntoffset%5C%22%3A%5C%22%5C%22%2C%5C%22filterTag%5C%22%3A%5C%22%5C%22%2C%5C%22service%5C%22%3A%5C%22%5C%22%2C%5C%22prop%5C%22%3A%5C%22%5C%22%2C%5C%22loc%5C%22%3A%5C%22%5C%22%2C%5C%22start_price%5C%22%3Anull%2C%5C%22end_price%5C%22%3Anull%2C%5C%22startPrice%5C%22%3Anull%2C%5C%22endPrice%5C%22%3Anull%2C%5C%22itemIds%5C%22%3Anull%2C%5C%22p4pIds%5C%22%3Anull%2C%5C%22p4pS%5C%22%3Anull%2C%5C%22categoryp%5C%22%3A%5C%22%5C%22%2C%5C%22myCNA%5C%22%3A%5C%22SVV4H1whwCwCAdRrH1X%2B4IOE%5C%22%7D%22%7D"
]
# Accumulates the image URLs extracted from every fetched page.
images_urls = []
# 1-based page counter, used only for the progress message below.
page = 1
# Visit each result-page URL listed in `urls`.
for url in urls:
# Sleep a random 2-5 seconds between requests to mimic a normal user.
time.sleep(random.randint(2, 5))
print("=" * 10 + f'正在爬取第{page}页' + '=' * 10)
page = page+1
# Send the request
headers = {
'Cookie':"wk_cookie2=1dd6127e93d2b80d0107d2ff2b1ce0e5; wk_unb=UUphzWEfsgPtdg3HYQ%3D%3D; cna=SVV4H1whwCwCAdRrH1X+4IOE; lgc=tb790367109; cancelledSubSites=empty; tracknick=tb790367109; _hvn_lgc_=0; sn=; arms_uid=d532aa5b-0ed2-4463-a235-a4cb2cddf3bd; miid=5027031754879055512; cookie2=4adf863e5e9f6b9a66e1b60ba187a734; _samesite_flag_=true; havana_lgc2_0=eyJoaWQiOjIyMDc2OTU0ODQ0MzAsInNnIjoiODBhNTNhNTk3MjQyMjQzNGRlNTc4MzA2ZmFkZTE4NWUiLCJzaXRlIjowLCJ0b2tlbiI6IjFKQ2d6QllWRTlkWWNGZEhaSlB2OWlnIn0; unb=2207695484430; _l_g_=Ug%3D%3D; xlly_s=1; sgcookie=E1008xhRKogmDOgF93k159fKv1Isw83NWd0kLxvhRk98i%2Ftm1Yu%2FrF%2FvgFZnEZlK%2Bc9vPgfoxh%2BYwHIewfqieshRfMJ%2BxvJ%2FLs404LeEb0TlD54%3D; havana_lgc_exp=1759825773890; uc1=cookie15=V32FPkk%2Fw0dUvg%3D%3D&existShop=false&cookie14=UoYcC%2FMz2rQw0w%3D%3D&cookie21=V32FPkk%2FgPzW&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&pas=0; uc3=id2=UUphzWEfsgPtdg3HYQ%3D%3D&vt3=F8dD37nib3oP3C1%2F9Zg%3D&nk2=F5RCbIxsAmuxIZ4%3D&lg2=VT5L2FSpMGV7TQ%3D%3D; csg=33133cc1; cookie17=UUphzWEfsgPtdg3HYQ%3D%3D; dnk=tb790367109; skt=fe76e31dc6e36fc5; existShop=MTcyODcyMTc3Mw%3D%3D; uc4=nk4=0%40FY4JgmrYSfWwhJOl0WBvqRflE53sqA%3D%3D&id4=0%40U2grFn5KLJReLldo4nxRaeZxpGCZYEyU; _cc_=W5iHLLyFfA%3D%3D; sg=902; _nk_=tb790367109; cookie1=AiA2dRGFIGtavVin69Bkr1A72fg%2FOHlC3RTUbcnWMaw%3D; _tb_token_=33e6b660d668e; sdkSilent=1728896789329; havana_sdkSilent=1728896789329; JSESSIONID=AFD2BFC71AB3684F3977B2772C7381A9; ariaDefaultTheme=undefined; mtop_partitioned_detect=1; _m_h5_tk=225b73be62420318cf8c71bea694a296_1728827208694; _m_h5_tk_enc=89f021c2da1fa1430fb58bdbe10e22ab; 
tfstk=gB2EgtitA9BedDWjfcHr05-ZChDKHvbfL8gSquqoA20nRumoUP45v2ZlJzyrSuFCvb_p4HeauL9CJ6EkUYMllZ6fhknIeYbXT0JMVBnmqTikEYqdRhDPCZ6fhkfnvYZGlJaU3sgIcLDotbcGIVnXK04oZdqiW0nk-0vhbl0tS0vkEYmiIDnvxL4oEGriW0DMmcmCVDhhchSoWjLMofu0xVJk33oKtGEeNKJ4QDozEH06EDwZYXu0YJnPmGigCRlj13_ZIoF_-ckVLEnUsokn48IDa4lzpAoz6wRQuynujfzCVQnaau2xpx-lKlkE8j2i9TSYolruGfy1mtHEK2VSputVRlyURkeaV3AiLvNZgJkcHe071oyr48QRQrruD7caUeSPNUnMFze8eTdrtcnZlGSwzEspk5O_zXOJwXe-bqsMeQd-tcnZlGSwwQhLecufjLC..; isg=BPb2DHQAjUoREXmvo8jR7eaCRyz4FzpRgk7NbWDfeVmho5M9yqfMYEQRv3nPCzJp",
'Referer':'https://s.taobao.com/search?_input_charset=utf-8&commend=all&ie=utf8&initiative_id=tbindexz_20170306&localImgKey=&page=5&q=%E4%B9%A6%E5%8C%85&search_type=item&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.201856.d13&ssid=s5-e&suggest_query=&tab=all&wq=',
'User-Agent':"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Mobile Safari/537.36"
}
# The timeout keeps a stalled connection from hanging the whole crawl.
response = requests.get(url=url, headers=headers, timeout=10)
if response.status_code == 200:
    # Pull every "pic_path" value (an image URL) out of the response text
    # and add it to the running list.
    pattern = r'"pic_path":"(.+?)"'
    images = re.findall(pattern, response.text)
    images_urls.extend(images)
else:
    # Report the failure instead of skipping the page silently.
    print(f"请求失败,状态码:{response.status_code}")
# Persist every collected image URL to images_url.txt, one URL per line; the
# download script reads this file back.
with open('images_url.txt', 'w', encoding='utf-8') as url_file:
    url_file.writelines(image_url + '\n' for image_url in images_urls)
print("图片URL已保存到images_url.txt文件中。")
import re
import os
import requests
import time
import random
# Directory that receives the downloaded images.
IMAGE_DIR = 'image'
# File written by the URL-collection script: one image URL per line.
URL_LIST_FILE = 'images_url.txt'

headers={
    "User-Agent":'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Mobile Safari/537.36',
    # The value used to be wrapped in an extra pair of single quotes, so the
    # header that was sent was not a valid URL.
    "referer":"https://s.taobao.com/search?_input_charset=utf-8&commend=all&ie=utf8&initiative_id=tbindexz_20170306&localImgKey=&page=5&q=%E4%B9%A6%E5%8C%85&search_type=item&source=suggest&sourceId=tb.index&spm=a21bo.jianhua%2Fa.201856.d13&ssid=s5-e&suggest_query=&tab=all&wq="
}


def normalize_image_url(line):
    """Return the URL held in one line of the URL list, ready to request.

    Surrounding whitespace (including the trailing newline) is removed, and a
    scheme-less ``//host/path`` URL gets an ``https:`` prefix. A blank line
    yields an empty string.
    """
    candidate = line.strip()
    if candidate.startswith('//'):
        return 'https:' + candidate
    return candidate


def image_filename(index):
    """Return the sequential file name for the *index*-th saved image."""
    return f"image_{index:04d}.jpg"


def download_images(url_list_path=URL_LIST_FILE, target_dir=IMAGE_DIR):
    """Download every image listed in *url_list_path* into *target_dir*.

    Images are numbered in the order they are successfully saved. Failed
    requests and non-200 responses are reported and skipped, so one bad URL
    does not stop the run.
    """
    os.makedirs(target_dir, exist_ok=True)
    image_index = 1
    with open(url_list_path, 'r', encoding='utf-8') as url_file:
        for line in url_file:
            ima_url = normalize_image_url(line)
            if not ima_url:
                # Skip blank lines without sleeping or sending a request.
                continue
            # Sleep a random 2-5 seconds to mimic a normal user.
            time.sleep(random.randint(2, 5))
            try:
                response = requests.get(url=ima_url, headers=headers, timeout=10)
            except requests.RequestException as e:
                print(f"下载图片时发生错误:{e}")
                continue
            if response.status_code != 200:
                print(f"下载图片失败,状态码:{response.status_code}")
                continue
            filepath = os.path.join(target_dir, image_filename(image_index))
            # A distinct name avoids shadowing the URL-list file that is
            # still being iterated.
            with open(filepath, 'wb') as image_file:
                for chunk in response.iter_content(1024):
                    image_file.write(chunk)
            print(f"图片已保存至:{filepath}")
            image_index += 1
    print("所有图片下载完成。")


if __name__ == "__main__":
    download_images()
运行结果: