Python + Appium + Fiddler: scraping Toutiao (今日头条)

To save the JSON content of responses with Fiddler, add the following code inside the OnBeforeResponse function in FiddlerScript:

	// Filter out unrelated traffic; only handle the requests we care about
	if (oSession.fullUrl.Contains("target-site"))  // substring of the URL to capture
	{
		// Decode the response first so the saved body is not garbled (e.g. gzipped)
		oSession.utilDecodeResponse();
		var fso;
		var file;
		fso = new ActiveXObject("Scripting.FileSystemObject");
		// Output path for the captures; customize it, but keep it in sync with
		// the path the Python script reads (E:/Fiddler Sessions/Sessions.txt)
		file = fso.OpenTextFile("output-file-path", 8, true);  // 8 = ForAppending
		//file.writeLine("Request-url:" + oSession.url);
		file.writeLine("http://" + oSession.url);
		file.writeLine("Request-body:" + oSession.GetResponseBodyAsString());
		//file.writeLine("\n");
		file.close();
	}
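
Each matching session appends two lines to the file: the request URL, then Request-body: followed by the decoded JSON. A quick way to confirm captures are arriving is a minimal read-back sketch like the one below (the path is an assumption; it must match whatever you configured in OnBeforeResponse):

# Minimal capture check (path assumed to match the FiddlerScript above).
path = 'E:/Fiddler Sessions/Sessions.txt'
with open(path, 'r') as f:
    text = f.read()
body = text.split('Request-body:')[-1]  # JSON body of the most recent capture
print(body[:200])  # peek at the start of the captured JSON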
On the Python side, Appium drives the app while the parser consumes Fiddler's capture file. Imports and session setup:

import re
import time
from datetime import datetime, timedelta

import requests
from appium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


class TouTiao():
    # Initialize the Appium session
    def __init__(self):
        # Device capabilities (platformName, deviceName, appPackage, ...)
        desired_caps = {
            xxxx}
        self.driver = webdriver.Remote("http://127.0.0.1:4723/wd/hub", desired_caps)
        time.sleep(3)
        # Tap the 苏州 (Suzhou) channel tab
        self.driver.find_element_by_xpath('//android.view.View[@content-desc="苏州"]').click()
        time.sleep(2)
        self.driver.swipe(100, 1300, 100, 800, 1000)  # swipe to dismiss overlays
        time.sleep(1)
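
    # For reference, desired_caps typically looks like the sketch below for this
    # setup. These are hypothetical values for illustration only: the package
    # name matches the resource ids used later in this script, but deviceName
    # and appActivity must come from your own device and app build.
    #
    # desired_caps = {
    #     'platformName': 'Android',
    #     'deviceName': 'emulator-5554',                # from `adb devices` (assumed)
    #     'appPackage': 'com.ss.android.article.news',  # Toutiao package, matches ids below
    #     'appActivity': '.activity.MainActivity',      # assumed launch activity
    #     'noReset': True,                              # keep login state between runs
    # }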

    # Check whether an element exists on the current screen
    def isElement(self, identifyBy, c):
        flag = None
        try:
            if identifyBy == "xpath":
                WebDriverWait(self.driver, timeout=10, poll_frequency=0.5).until(
                    lambda x: x.find_element_by_xpath(c), message='xpath lookup timed out')
            elif identifyBy == "id":
                self.driver.find_element_by_id(c)
            flag = True
        except (NoSuchElementException, TimeoutException):
            flag = False
        finally:
            return flag

    # Normalize the various timestamp formats Toutiao displays
    def parse_time(self, s_time):
        result_time = ''
        # e.g. 2018-09-29 04:29:30
        if re.findall(r'\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', s_time):
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(s_time, "%Y-%m-%d %H:%M:%S"))
        # e.g. 2018-09-29 04:29
        elif re.findall(r'\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}', s_time):
            result_time = time.strftime("%Y-%m-%d %H:%M", time.strptime(s_time, "%Y-%m-%d %H:%M"))
        # e.g. 6天前 ("6 days ago")
        elif '天前' in s_time:
            days = re.findall(r'(\d+)天前', s_time)[0]
            result_time = (datetime.now() - timedelta(days=int(days))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 1小时前 ("1 hour ago")
        elif '小时前' in s_time:
            hours = re.findall(r'(\d+)小时前', s_time)[0]
            result_time = (datetime.now() - timedelta(hours=int(hours))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 28分钟前 ("28 minutes ago")
        elif '分钟前' in s_time:
            minutes = re.findall(r'(\d+)分钟前', s_time)[0]
            result_time = (datetime.now() - timedelta(minutes=int(minutes))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 12-22 (month-day only: assume the current year)
        elif re.findall(r'\d{1,2}-\d{1,2}', s_time) and len(s_time) <= 5:
            now_year = str(datetime.now().year)
            _time = now_year + '-' + s_time
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(_time, "%Y-%m-%d"))
        # e.g. 12-22 15:07 (keep the clock time, date taken as yesterday)
        elif re.findall(r'\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}', s_time):
            last_time = re.findall(r'.*?(\d{1,2}:\d{1,2})', s_time)[0]
            days_ago = datetime.now() - timedelta(days=1)
            y_m_d = str(days_ago.year) + '-' + str(days_ago.month) + '-' + str(days_ago.day)
            _time = y_m_d + ' ' + last_time
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(_time, "%Y-%m-%d %H:%M"))
        # five numbers in the string, e.g. 2021年03月30日 20:50
        elif len(re.findall(r"\d+\.?\d*", s_time)) == 5:
            time_list = re.findall(r"\d+\.?\d*", s_time)  # digits only
            times = (time_list[0] + '-' + time_list[1] + '-' + time_list[2] + ' '
                     + time_list[3] + ':' + time_list[4] + ':' + '00')  # build xxxx-xx-xx xx:xx:00
            result_time = datetime.strptime(times, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")
        return result_time

    # Read the response that Fiddler saved to the capture file
    def get_url(self, source):
        pathtxt = 'E:/Fiddler Sessions/Sessions.txt'
        with open(pathtxt, 'r') as f:
            txt = f.read()
        text = txt.split('Request-body:')[-1]  # body of the most recent capture
        # Bind JSON's false/null/true to empty strings so eval() can parse the body
        global false, null, true
        false = null = true = ''
        if len(text) > 0:
            content_dict = eval(text)
            if 'group' in content_dict.keys():
                # Post (微头条) item
                group = content_dict['group']
                username = group['user_name']
                if 'fw_id' in group:
                    article_id = group['fw_id']
                else:
                    repost_params = content_dict['repost_params']
                    article_id = repost_params['fw_id']
                url = 'https://www.toutiao.com/w/i' + str(article_id) + '/'
            else:
                # Article item
                data = content_dict['data']
                share = data['share_info']
                url = share['share_url']
                username = data['source']
            # Fixed fields for the (commented-out) database insert
            handle = 2
            platform_id = 55
            classify = '苏州'
            print("url:", url)
            avatar, times, picture, content, title = self.get_content(url)
            publish_date = self.parse_time(times)
            print("title:", title)
            print("publish time:", publish_date)
            print("avatar:", avatar)
            print("picture:", picture)
            print("content:", content)
            # self.insert_op(title, username, url, publish_date, picture, content, platform_id, handle, classify, source)
            # file = open("E:/Fiddler Sessions/Sessions.txt", "w+")  # create the file if missing
            # file.truncate()  # clear the captured text
            # file.close()
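
    # Aside: the eval() above only parses the body because false/null/true are
    # pre-bound as globals, and it will execute anything that lands in the
    # capture file. A safer sketch (hypothetical helper, not in the original)
    # uses the standard-library json module; note that true/false/null then
    # become True/False/None instead of empty strings.
    def read_last_body(self, path='E:/Fiddler Sessions/Sessions.txt'):
        import json
        with open(path, 'r') as f:
            text = f.read().split('Request-body:')[-1]
        return json.loads(text) if text.strip() else None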
    # Automate the Toutiao 苏州 (Suzhou) channel
    def toutiao(self):
        while True:
            source = '苏州'  # default location if the item has no location label
            # Note: resource ids like :id/eyu are specific to this app version
            area_status = self.isElement('id', 'com.ss.android.article.news:id/eyu')  # location label present?
            if area_status:
                area = self.driver.find_element_by_id('com.ss.android.article.news:id/eyu').text
                if "m" in area:  # label ends with a distance, e.g. "苏州·xx 1.2km"
                    if '·' in area:
                        address = area.split(' ')[0]
                        source = address.split('·')[-1]
                else:
                    if '·' in area:
                        address = area
                        source = address.split('·')[-1]
                print("location:", source)
            dynamic_status = self.isElement('id', 'com.ss.android.article.news:id/d30')
            if dynamic_status:
                self.driver.find_element_by_id('com.ss.android.article.news:id/d30').click()  # open the post
                time.sleep(1)
                self.get_url(source)
                self.driver.back()  # back to the main feed
                time.sleep(1)
            self.driver.swipe(100, 1300, 100, 800, 2000)  # scroll to the next item
            time.sleep(1)
    # Fetch the article/post page and extract its fields
    def get_content(self, url):
        s = requests.session()
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        }
        # Cookies copied from a logged-in browser session; they expire and must be refreshed
        cookies = {'tt_webid': '6944229382073796126',
                   'csrftoken': '1fc04d1168f1dda1f56009c05e912386',
                   'passport_csrf_token_default': 'ad7a083c657ff583a0ec4fd7e1121e3b',
                   'passport_csrf_token': 'ad7a083c657ff583a0ec4fd7e1121e3b',
                   'sso_uid_tt_ss': 'baa07f37daa59fba18c57e68c235a7cd',
                   'toutiao_sso_user': '0901228a2038e713243a9a8e3a183ecb',
                   'toutiao_sso_user_ss': '0901228a2038e713243a9a8e3a183ecb',
                   'sso_uid_tt': 'baa07f37daa59fba18c57e68c235a7cd',
                   'uid_tt': '40aa4013460012d41179aa9a67e88799',
                   'sid_tt': '621f8eef127fd68c50990faf77a6e5f5',
                   'sessionid': '621f8eef127fd68c50990faf77a6e5f5',
                   'sid_guard': '621f8eef127fd68c50990faf77a6e5f5%7C1616829441%7C5184000%7CWed%2C+26-May-2021+07%3A17%3A21+GMT',
                   'uid_tt_ss': 'baa07f37daa59fba18c57e68c235a7cd',
                   'sessionid_ss': '0901228a2038e713243a9a8e3a183ecb',
                   's_v_web_id': 'verify_kmvhdyi1_d6py15Ud_f1PY_4PTf_8RGz_Nhpd8cDzmlRP',
                   '__ac_nonce': '06062e596007f40d1b10e',
                   '__ac_signature': '_02B4Z6wo00f011l60QQAAIDCjrz8f7ADJrdZXtWAALZHoTjmFJ2qk-rEha2WUWEoFdctDlVmqEwuZI5-BXprf3Ni1NXpPz4e4MIAJFDWnHASvBGpIoCjRgDL.GIC9YDAFlIAoSl12oWY30hI8a',
                   '_tea_utm_cache_2256': '{%22utm_source%22:%22copy_link%22%2C%22utm_medium%22:%22toutiao_android%22%2C%22utm_campaign%22:%22client_share%22}',
                   'MONITOR_WEB_ID': '9c51c9ee-7387-466f-8e73-916a734bdce1',
                   'tt_anti_token': 'IOoFvgLmWzi-89c8f0f20a1e5831f03b5ee9f9359d7c501c4c594e7b0e33fff69859992c5ebe',
                   'tt_scid': 'pVhdZDPy5On22JTkfQx9HsD94M6UP00Ej6ZSRvByY9xLO3TjpsW7f5l9Tlv8m.QX948f'}
        response = s.get(url=url, headers=headers, timeout=None, verify=False, cookies=cookies)
        response.encoding = response.apparent_encoding
        response_text = response.text
        if response.status_code == 200:
            soup = BeautifulSoup(response_text, 'html.parser')
            # Title
            if soup.find(attrs={'class': 'article-content'}):
                article = soup.find(attrs={'class': 'article-content'})
                title = article.find_all('h1')[0].text
            else:
                title = ''
            # Avatar
            if soup.find(attrs={'class': 'user-avatar'}):  # article page
                a = soup.find(attrs={'class': 'user-avatar'})
                avatar = a.find_all('img')[0].get('src')
            else:  # post page
                div = soup.find(attrs={'class': 'author-info'})
                avatar = div.find_all('img')[0].get('src')
            # Publish time
            if soup.find(attrs={'class': 'publish-time'}):
                times = soup.find(attrs={'class': 'publish-time'}).text
            else:
                text = soup.find(attrs={'class': 'article-meta'})
                if len(text.find_all('span')) == 3:
                    times = text.find_all('span')[2].text
                else:
                    times = text.find_all('span')[1].text
            # Body content
            if soup.find(attrs={'class': 'article-content'}):
                content_div = soup.find(attrs={'class': 'article-content'})
                content = content_div.find_all('article')[0].text
            elif soup.find(attrs={'class': 'weitoutiao-html'}):  # post page
                content = soup.find(attrs={'class': 'weitoutiao-html'}).text
            else:
                content = ''
            # First picture
            if soup.find(attrs={'class': 'pgc-img'}):
                img = soup.find(attrs={'class': 'pgc-img'})
                if len(img.find_all('img')) > 0:
                    picture = img.find_all('img')[0].get('src')
                else:
                    picture = ''
            else:
                img = soup.find(attrs={'class': 'image-list'})
                if img is not None and len(img.find_all('img')) > 0:
                    picture = "https:" + img.find_all('img')[0].get('src')
                else:
                    picture = ''
            return avatar, times, picture, content, title
        # Non-200 responses: return empty fields so the caller's unpacking still works
        return '', '', '', '', ''
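
Finally, a minimal way to run the pipeline (a sketch, assuming the Appium server is listening on port 4723 and the device's traffic is already proxied through Fiddler; the variable name is arbitrary):

import urllib3

if __name__ == '__main__':
    # Optional: silence the InsecureRequestWarning triggered by verify=False in get_content
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    bot = TouTiao()   # connect to Appium and open the 苏州 channel
    bot.toutiao()     # loop: open a post, parse the captured response, go back, swipe on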

 
