# python-spider
"""Scrape used-car listings from guazi.com (Shenzhen) into a CSV file.

Background note kept from the original author:
  * Stateless request: the server hands nothing back to the client.
  * Stateful request: the server returns something -- in effect it assigns
    the client an id, which the browser stores and re-sends on later
    requests. The hard-coded ``Cookie`` header below plays that role here.
"""
import re

import lxml  # noqa: F401 -- not referenced directly; backs BeautifulSoup's 'lxml' parser
import requests
from bs4 import BeautifulSoup

# Number of listing pages fetched per run.
PAGE_COUNT = 3
LISTING_URL = 'https://www.guazi.com/sz/buy/o{}/#bread'
OUTPUT_PATH = r'D:\Typora\2020-04-06\guazi.csv'
# Seconds to wait on the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Request "identity" sent with every page fetch (built once, not per page).
# NOTE(review): the Cookie is a session captured from a browser in 2020; it is
# presumably what gets past the site's anti-crawling check -- confirm it is
# still valid, and consider loading it from configuration rather than source.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Cookie': "track_id=64470618111905792; uuid=1f7711a6-2666-4118-ccba-21d079d62a19; antipas=A324J8H7723967677PA9H49713; cityDomain=sz; clueSourceCode=%2A%2300; user_city_id=17; ganji_uuid=8532394690421830647367; sessionid=4e3b40e1-4fe0-49e5-b013-0a6ababc8547; lg=1; lng_lat=114.00978_22.53774; gps_type=1; close_finance_popup=2020-04-10; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22pcbiaoti%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%22-%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22track_id%22%3A%2264470618111905792%22%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%221f7711a6-2666-4118-ccba-21d079d62a19%22%2C%22ca_city%22%3A%22sz%22%2C%22sessionid%22%3A%224e3b40e1-4fe0-49e5-b013-0a6ababc8547%22%7D; preTime=%7B%22last%22%3A1586526224%2C%22this%22%3A1586526193%2C%22pre%22%3A1586526193%7D",
}


def _fetch_listing(page_index):
    """Download one listing page and return its ``<li>`` entries.

    Returns ``None`` when the page does not contain the expected car-list
    ``<ul>`` (for example when the server answers with something other than
    the listing), so the caller can skip the page instead of crashing.
    Network errors, including the timeout, propagate to the caller.
    """
    url = LISTING_URL.format(page_index)
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # resp.content is the raw response bytes; decode() turns them into str
    # using the default UTF-8 codec.
    html = resp.content.decode()
    soup = BeautifulSoup(html, 'lxml')
    car_list = soup.find('ul', {'class': 'carlist clearfix js-top'})
    if car_list is None:
        return None
    return car_list.find_all('li')


def _write_listing(items, out):
    """Print and append one CSV line per complete car entry in *items*.

    Entries missing the title, the details line, or either price tag are
    skipped. Each written line is ``name,year,new_price,old_price``.
    """
    for item in items:
        title_tag = item.find('h2')
        details_tag = item.find('div', {'class': 't-i'})
        if title_tag is None or details_tag is None:
            # Not a regular car entry; nothing to record.
            continue

        # Spaces in the title become commas, so a multi-word title spreads
        # over several CSV columns (behaviour kept from the original).
        name_fields = [re.sub(r' ', ',', title_tag.get_text())]
        print(name_fields)
        # The details line is '|'-separated; its first field is recorded.
        detail_fields = details_tag.get_text().split('|')
        print(detail_fields)

        try:
            price_box = item.find('div', {'class': 't-price'})
            new_price = price_box.find('p').get_text()
            old_price = price_box.find('em').get_text()
        except AttributeError:
            # A tag in the price block is missing (find() returned None).
            continue
        out.write('{},{},{},{}\n'.format(
            name_fields[0], detail_fields[0], new_price, old_price))


def main():
    """Crawl ``PAGE_COUNT`` listing pages once and append the rows to the CSV."""
    # NOTE(review): indices start at 0, so the first URL is ".../o0/" and the
    # progress message says page 0 -- confirm whether the site numbers its
    # pages from 1.
    for page_index in range(PAGE_COUNT):
        print("正在爬取第{}页".format(page_index))
        items = _fetch_listing(page_index)
        if items is None:
            # Message: "car list not found on page N, skipped (anti-crawling
            # may have been triggered)".
            print("第{}页未找到车源列表,已跳过(可能触发了反爬机制)".format(page_index))
            continue
        with open(OUTPUT_PATH, 'a+', encoding='utf-8') as f:
            _write_listing(items, f)


if __name__ == "__main__":
    main()