import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import re,os
from hashlib import md5
def get_page(page_num, search_id, keyword='街拍', timeout=10):
    """Fetch one page of Toutiao search results and return its HTML.

    Args:
        page_num: Zero-based result page index. Page 0 is requested with the
            short "input" query; later pages use the "pagination" query.
        search_id: Search session id sent with pagination requests. It is
            ignored for page 0.
        keyword: Search term. Defaults to the term the script was written for.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise None (request failure or any other status code).
    """
    # Only build the parameter set that is actually sent for this page.
    if page_num == 0:
        params = {
            'dvpf': 'pc',
            'source': 'input',
            'keyword': keyword,
        }
    else:
        params = {
            'keyword': keyword,
            'pd': 'synthesis',
            'source': 'pagination',
            'dvpf': 'pc',
            'aid': 4916,
            'page_num': page_num,
            'search_id': search_id,
        }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    try:
        # Without a timeout requests waits forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('ERROR1:', e)
        return None
    if response.status_code == 200:
        return response.text
    # Report the failure instead of returning None without explanation.
    print('ERROR1:', 'unexpected status code', response.status_code)
    return None
def parse_pg(html):
    """Yield the image URLs found in a search-result page.

    Selects every ``<img>`` matched by the ``.abs-fill img`` CSS selector and
    yields its ``src`` attribute, printing each URL as it is produced.

    Args:
        html: HTML text of a result page.

    Yields:
        The ``src`` value of each matching image. Images with no ``src``
        attribute are skipped so callers never receive None.
    """
    doc = pq(html)
    for img in doc('.abs-fill img').items():
        src = img.attr('src')
        if not src:
            # An <img> without a src has nothing to download.
            continue
        print(src)
        yield src
def save_img(img, save_dir=r'D:\pycharm_projects\街拍', timeout=10):
    """Download one image and store it under ``save_dir``.

    The file is named after the MD5 hex digest of its content with a ``.jpg``
    extension, so the same image downloaded twice is written only once.

    Args:
        img: URL of the image to download.
        save_dir: Directory the image is written to. It is created, including
            missing parent directories, if it does not exist.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Download and file-system errors are reported on stdout with the
        ``ERROR2:`` prefix rather than raised.
    """
    if not img:
        # Nothing to request; avoid sending a bogus URL to requests.
        print('ERROR2:', 'missing image URL')
        return
    # makedirs with exist_ok also creates missing parents and does not fail
    # if the directory appears between an existence check and the creation.
    os.makedirs(save_dir, exist_ok=True)
    try:
        response = requests.get(img, timeout=timeout)
        if response.status_code != 200:
            return
        content = response.content
        file_name = '{}.{}'.format(md5(content).hexdigest(), 'jpg')
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print('alredy download')
            return
        with open(file_path, 'wb') as f:
            f.write(content)
    except (requests.RequestException, OSError) as e:
        print('ERROR2:', e)
def main(page_count=2):
    """Download the images from the first ``page_count`` result pages.

    Page 0 is fetched first; the ``search_id`` query parameter is read from a
    link on that page and passed to the requests for the following pages.
    Every image URL found on a page is printed and saved with ``save_img``.

    Args:
        page_count: Number of result pages to process, starting at page 0.
    """
    search_id = ''
    for page_num in range(page_count):
        html = get_page(page_num, search_id)
        if html is None:
            # get_page already reported the failure; without page 0 there is
            # no search_id, so later pages cannot be requested either.
            if page_num == 0:
                return
            continue

        found_search_id = True
        if page_num == 0:
            doc = pq(html)
            href = doc('.result-content:last-child a:first-child').attr('href')
            # [^&]+ stops at the next query parameter; a greedy .* would
            # swallow everything up to the end of the link.
            match = re.search(r'search_id=([^&]+)', href or '')
            if match:
                search_id = match.group(1)
            else:
                found_search_id = False
                print('ERROR3:', 'search_id not found on first page')

        for img in parse_pg(html):
            print(img)
            save_img(img)

        if not found_search_id:
            # Pagination requests need the search_id, so stop after page 0.
            break
# Run the scraper only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()