import re
import requests
from pyquery import PyQuery as pq
# Address of the listing page that get_page_content() is run against below.
url = 'http://www.bytravel.cn/Scene/mu.html'
def get_page_content(url, timeout=10):
    """Download a listing page and print the title of every matched entry.

    The page is requested with a browser-like User-Agent. Every
    ``<div id=tctitletop10>...</a></div></div></div>`` fragment is cut out of
    the HTML with a regular expression, and one title per fragment is printed
    to stdout: the ``alt`` text of the image link inside ``#tctitletop102``
    when there is one, otherwise the text of the ``.f14b`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The reported response encoding and the titles are printed.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # Without a timeout a stalled server would block this call forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    print(response.encoding)

    # requests falls back to ISO-8859-1 when a text response declares no
    # charset, which turns the page into mojibake. The page is UTF-8 (GBK
    # decoding did not work), so decode the body as UTF-8 once, up front,
    # instead of re-encoding every extracted title afterwards. A charset the
    # server does declare is trusted as-is.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = 'utf-8'
    response_html = response.text

    result = re.findall(r'(<div id=tctitletop10>.*?</a></div></div></div>)', response_html)
    for div in result:
        doc = pq(div)
        title = doc('#tctitletop102 a img').attr('alt')
        if title is None:
            # No image link in this fragment; use the text title instead.
            title = doc('.f14b').text()
        print(title)
# Run the scraper against the module-level url as soon as this file is executed.
get_page_content(url)