Python:爬虫第一步
"""" title : 爬虫 dateTime : 2020-6-30 """ # from urllib import request # from urllib import request # import urlopen,Request from urllib.request from urllib.request import urlopen, Request # from lxml import etree from lxml import etree import re # import lxml # weburl = "https://bz.zzzmh.cn/#people" weburl = "https://movie.douban.com/top250" def crow(i): global weburl headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'} ret = Request(weburl, headers=headers) res = urlopen(ret) html = res.read().decode('utf-8') # with open("mm.txt", "w", encoding="utf-8") as fp: # fp.write(html) # etree = html.etree html = etree.HTML(html) # datas = html.xpath("") # class="view-body" # pattern01 = r'<a href="(.*?)" title=".*?" class="tit" target="_blank">.*?</a> <span><u><a href="http://www.27270.com/ent/meinvtupian/" title="美女图片">美女图片</a>' datas = html.xpath('//ol[@class="grid_view"]/li') # <ol class="grid_view"> for data in datas: data_title = data.xpath('div/div[2]/div[@class="hd"]/a/span[1]/text()') data_star = data.xpath('div/div[2]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()') data_quote = data.xpath('div/div[2]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()') print(data_title, end=" ,") print(data_star, end=" ,简介:") print(data_quote) crow(1)