# 获取动态IP (fetch the dynamic IPs)
import base64
import re

import lxml.html
import requests


class Exam_spider:
    """Fetch the exam page and recover the IP addresses hidden in its markup.

    The flow implemented by :meth:`run` is: request the base URL to obtain a
    ``session`` cookie, derive the ``c1``/``c2`` values from it, request the
    second page with those values, then strip the hidden markup and print the
    remaining addresses.
    """

    # Seconds to wait for each HTTP request before giving up, so a stalled
    # server cannot hang the script forever.
    REQUEST_TIMEOUT = 10

    # CSS rules of the form ".ABC{display:none}" name the hidden span classes.
    _HIDDEN_CLASS_RE = re.compile(r'\.([A-Z]+){display:none}')
    # Everything from "<div " to the end of that line.
    _DIV_RE = re.compile(r'<div\s.*')
    # Spans hidden through an inline style.
    _HIDDEN_STYLE_SPAN_RE = re.compile(r'<span\sstyle="display:none">.*?</span>')

    def __init__(self):
        self.base_url = 'http://datamining.comratings.com/exam'
        self.s = requests.session()

    def down_first(self):
        """Perform the first request.

        :return: value of the ``session`` cookie set by the response, or
            ``None`` when the response carries no such cookie
        """
        res = self.s.get(self.base_url, timeout=self.REQUEST_TIMEOUT)
        return res.cookies.get_dict().get('session')

    def down_second(self, cookie: dict) -> bytes:
        """Perform the second request (base URL with ``'3'`` appended).

        :param cookie: mapping produced by :meth:`make_cookie`
        :return: raw response body
        """
        # NOTE(review): ``cookie`` has a single 'Cookie' key but is passed via
        # ``cookies=``, so requests treats it as one cookie named "Cookie"
        # rather than as a raw header. Kept as-is to preserve behaviour;
        # confirm against the server before changing it to ``headers=``.
        res = self.s.get(
            self.base_url + '3',
            cookies=cookie,
            timeout=self.REQUEST_TIMEOUT,
        )
        return res.content

    def f1(self, a: str) -> str:
        """Compute the cookie value that the page's JavaScript would generate.

        This is standard Base64 (with ``=`` padding) applied to the low byte
        of each character of *a*.

        :param a: session id (or a slice of it) from the first request
        :return: Base64 text; an empty string when *a* is empty
        """
        # Masking with 0xff mirrors the original hand-written encoder, which
        # only ever looked at the low eight bits of each character code.
        low_bytes = bytes(ord(ch) & 0xff for ch in a)
        return base64.b64encode(low_bytes).decode('ascii')

    def make_cookie(self, sessionid: str) -> dict:
        """Build the complete cookie string for the second request.

        :param sessionid: session id obtained from the first request
        :return: ``{'Cookie': 'session=...; c1=...; c2=...'}`` where ``c1``
            encodes ``sessionid[1:4]`` and ``c2`` encodes the whole id
        """
        parts = [
            "session=" + sessionid + ';',
            "c1=" + self.f1(sessionid[1:4]) + ';',
            "c2=" + self.f1(sessionid),
        ]
        return {'Cookie': " ".join(parts)}

    def save_result(self, result: bytes) -> None:
        """Write the raw response to ``example_spider_result.html``.

        :param result: body of the second response
        """
        with open('example_spider_result.html', 'wb') as fp:
            fp.write(result)

    @staticmethod
    def _split_addresses(pieces) -> list:
        """Join consecutive text fragments into dotted addresses.

        A new address is started once the accumulated text contains three
        dots and does not end with a dot. The trailing accumulator is always
        appended, so the result is never empty.
        """
        addresses = []
        current = ""
        for piece in pieces:
            if current.count('.') == 3 and current[-1] != '.':
                addresses.append(current)
                current = ""
            current = current + piece
        addresses.append(current)
        return addresses

    def analysis_content(self, result: bytes) -> list:
        """Strip the hidden markup from the page and extract the IP list.

        The list and its length are printed, and the list is also returned.

        :param result: body of the second response, UTF-8 encoded
        :return: list of address strings recovered from the page
        """
        page = result.decode('utf-8')

        # Collect the hidden class names before any markup is removed.
        hidden_classes = self._HIDDEN_CLASS_RE.findall(page)

        page = self._DIV_RE.sub("", page)
        page = self._HIDDEN_STYLE_SPAN_RE.sub("", page)

        # Remove the spans of every hidden class (the page may declare any
        # number of them). ``.*`` is greedy and does not cross newlines, so
        # each substitution runs from the hidden span to the last ``</span>``
        # on the same line, exactly as the original patterns did.
        # NOTE(review): confirm greedy matching is intended for this page.
        for class_name in hidden_classes:
            hidden_span = re.compile(
                r'<span\sclass="' + class_name + r'">.*</span>'
            )
            page = hidden_span.sub("", page)

        html = lxml.html.fromstring(page.replace("\n", ""))
        text_nodes = html.xpath('//body/descendant-or-self::text()')

        # The first text node is skipped, as in the original implementation
        # (presumably it is not part of an address; TODO confirm).
        addresses = self._split_addresses(text_nodes[1:])
        print(addresses)
        print(len(addresses))
        return addresses

    def run(self) -> None:
        """Run the whole scrape: fetch, build cookies, fetch, parse, save.

        :raises RuntimeError: if the first response sets no ``session`` cookie
        """
        session_id = self.down_first()
        if session_id is None:
            # Fail with a clear message instead of a TypeError raised from
            # string concatenation inside make_cookie.
            raise RuntimeError(
                "first response did not set a 'session' cookie"
            )
        cookie = self.make_cookie(session_id)
        result = self.down_second(cookie)
        self.analysis_content(result)
        self.save_result(result)


if __name__ == '__main__':
    spider = Exam_spider()
    spider.run()