爬虫_豆瓣全部正在热映电影 (xpath)
单纯地练习一下xpath
import requests
from lxml import etree

# Browser-like User-Agent: sites such as Douban tend to refuse the default
# "python-requests/x.y" agent string.
_DEFAULT_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/120.0 Safari/537.36'
    ),
}


def get_url(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    response = requests.get(url, headers=_DEFAULT_HEADERS, timeout=timeout)
    # Fail loudly on an error page instead of handing it to the parser.
    response.raise_for_status()
    return response.text


def parse_html(html):
    """Extract one record per entry of the first ``<ul class="lists">``.

    Args:
        html: HTML source of the listing page.

    Returns:
        A list of dicts with the keys ``'href'``, ``'title'``, ``'mark'``,
        ``'actors'`` and ``'director'``. ``'title'`` is a string; the other
        values are the lists produced by the XPath queries (empty when
        nothing matched). The list is empty when the page contains no
        ``<ul class="lists">``. The same list is also printed.
    """
    html_element = etree.HTML(html)
    movie_lists = html_element.xpath('//ul[@class="lists"]')
    # Only the first matching list is read. If the page has none, produce
    # no records rather than failing with an IndexError.
    entries = movie_lists[0] if movie_lists else ()

    informations = []
    for li in entries:
        information = {
            'href': li.xpath('.//li[@class="poster"]/a/@href'),
            # normalize-space() strips the surrounding line breaks and
            # collapses runs of whitespace, yielding a plain string.
            'title': li.xpath(
                'normalize-space(.//li[@class="stitle"]/a/@title)'
            ),
            'mark': li.xpath('.//span[@class="subject-rate"]/text()'),
            'actors': li.xpath('@data-actors'),
            'director': li.xpath('@data-director'),
        }
        informations.append(information)

    print(informations)
    return informations


def main():
    """Fetch the Beijing now-playing page and print the parsed records."""
    url = 'https://movie.douban.com/cinema/nowplaying/beijing/'
    html = get_url(url)
    parse_html(html)


if __name__ == '__main__':
    main()