Python + Appium + Fiddler: scraping Toutiao (今日头条)

To save the JSON content of responses with Fiddler, add the following code inside the OnBeforeResponse function in FiddlerScript:

	// Filter out unrelated traffic; only handle the requests we care about
	if (oSession.fullUrl.Contains("target-site"))  // substring of the URL to capture
	{
		// Decode the response first so the saved body is not garbled (e.g. gzipped)
		oSession.utilDecodeResponse();
		var fso;
		var file;
		fso = new ActiveXObject("Scripting.FileSystemObject");
		// Output path for the captures; customize it, but keep it in sync with
		// the path the Python script reads (E:/Fiddler Sessions/Sessions.txt)
		file = fso.OpenTextFile("output-file-path", 8, true);  // 8 = ForAppending
		//file.writeLine("Request-url:" + oSession.url);
		file.writeLine("http://" + oSession.url);
		file.writeLine("Request-body:" + oSession.GetResponseBodyAsString());
		//file.writeLine("\n");
		file.close();
	}
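
Each matching session appends two lines to the file: the request URL, then Request-body: followed by the decoded JSON. A quick way to confirm captures are arriving is a minimal read-back sketch like the one below (the path is an assumption; it must match whatever you configured in OnBeforeResponse):

# Minimal capture check (path assumed to match the FiddlerScript above).
path = 'E:/Fiddler Sessions/Sessions.txt'
with open(path, 'r') as f:
    text = f.read()
body = text.split('Request-body:')[-1]  # JSON body of the most recent capture
print(body[:200])  # peek at the start of the captured JSON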
On the Python side, Appium drives the app while the parser consumes Fiddler's capture file. Imports and session setup:

import re
import time
from datetime import datetime, timedelta

import requests
from appium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait


class TouTiao():
    # Initialize the Appium session
    def __init__(self):
        # Device capabilities (platformName, deviceName, appPackage, ...)
        desired_caps = {
            xxxx}
        self.driver = webdriver.Remote("http://127.0.0.1:4723/wd/hub", desired_caps)
        time.sleep(3)
        # Tap the 苏州 (Suzhou) channel tab
        self.driver.find_element_by_xpath('//android.view.View[@content-desc="苏州"]').click()
        time.sleep(2)
        self.driver.swipe(100, 1300, 100, 800, 1000)  # swipe to dismiss overlays
        time.sleep(1)
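
    # For reference, desired_caps typically looks like the sketch below for this
    # setup. These are hypothetical values for illustration only: the package
    # name matches the resource ids used later in this script, but deviceName
    # and appActivity must come from your own device and app build.
    #
    # desired_caps = {
    #     'platformName': 'Android',
    #     'deviceName': 'emulator-5554',                # from `adb devices` (assumed)
    #     'appPackage': 'com.ss.android.article.news',  # Toutiao package, matches ids below
    #     'appActivity': '.activity.MainActivity',      # assumed launch activity
    #     'noReset': True,                              # keep login state between runs
    # }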

    # Check whether an element exists on the current screen
    def isElement(self, identifyBy, c):
        flag = None
        try:
            if identifyBy == "xpath":
                WebDriverWait(self.driver, timeout=10, poll_frequency=0.5).until(
                    lambda x: x.find_element_by_xpath(c), message='xpath lookup timed out')
            elif identifyBy == "id":
                self.driver.find_element_by_id(c)
            flag = True
        except (NoSuchElementException, TimeoutException):
            flag = False
        finally:
            return flag

    # Normalize the various timestamp formats Toutiao displays
    def parse_time(self, s_time):
        result_time = ''
        # e.g. 2018-09-29 04:29:30
        if re.findall(r'\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}', s_time):
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(s_time, "%Y-%m-%d %H:%M:%S"))
        # e.g. 2018-09-29 04:29
        elif re.findall(r'\d{1,4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}', s_time):
            result_time = time.strftime("%Y-%m-%d %H:%M", time.strptime(s_time, "%Y-%m-%d %H:%M"))
        # e.g. 6天前 ("6 days ago")
        elif '天前' in s_time:
            days = re.findall(r'(\d+)天前', s_time)[0]
            result_time = (datetime.now() - timedelta(days=int(days))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 1小时前 ("1 hour ago")
        elif '小时前' in s_time:
            hours = re.findall(r'(\d+)小时前', s_time)[0]
            result_time = (datetime.now() - timedelta(hours=int(hours))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 28分钟前 ("28 minutes ago")
        elif '分钟前' in s_time:
            minutes = re.findall(r'(\d+)分钟前', s_time)[0]
            result_time = (datetime.now() - timedelta(minutes=int(minutes))).strftime("%Y-%m-%d %H:%M:%S")
        # e.g. 12-22 (month-day only: assume the current year)
        elif re.findall(r'\d{1,2}-\d{1,2}', s_time) and len(s_time) <= 5:
            now_year = str(datetime.now().year)
            _time = now_year + '-' + s_time
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(_time, "%Y-%m-%d"))
        # e.g. 12-22 15:07 (keep the clock time, date taken as yesterday)
        elif re.findall(r'\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}', s_time):
            last_time = re.findall(r'.*?(\d{1,2}:\d{1,2})', s_time)[0]
            days_ago = datetime.now() - timedelta(days=1)
            y_m_d = str(days_ago.year) + '-' + str(days_ago.month) + '-' + str(days_ago.day)
            _time = y_m_d + ' ' + last_time
            result_time = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(_time, "%Y-%m-%d %H:%M"))
        # five numbers in the string, e.g. 2021年03月30日 20:50
        elif len(re.findall(r"\d+\.?\d*", s_time)) == 5:
            time_list = re.findall(r"\d+\.?\d*", s_time)  # digits only
            times = (time_list[0] + '-' + time_list[1] + '-' + time_list[2] + ' '
                     + time_list[3] + ':' + time_list[4] + ':' + '00')  # build xxxx-xx-xx xx:xx:00
            result_time = datetime.strptime(times, "%Y-%m-%d %H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")
        return result_time

    # Read the response that Fiddler saved to the capture file
    def get_url(self, source):
        pathtxt = 'E:/Fiddler Sessions/Sessions.txt'
        with open(pathtxt, 'r') as f:
            txt = f.read()
        text = txt.split('Request-body:')[-1]  # body of the most recent capture
        # Bind JSON's false/null/true to empty strings so eval() can parse the body
        global false, null, true
        false = null = true = ''
        if len(text) > 0:
            content_dict = eval(text)
            if 'group' in content_dict.keys():
                # Post (微头条) item
                group = content_dict['group']
                username = group['user_name']
                if 'fw_id' in group:
                    article_id = group['fw_id']
                else:
                    repost_params = content_dict['repost_params']
                    article_id = repost_params['fw_id']
                url = 'https://www.toutiao.com/w/i' + str(article_id) + '/'
            else:
                # Article item
                data = content_dict['data']
                share = data['share_info']
                url = share['share_url']
                username = data['source']
            # Fixed fields for the (commented-out) database insert
            handle = 2
            platform_id = 55
            classify = '苏州'
            print("url:", url)
            avatar, times, picture, content, title = self.get_content(url)
            publish_date = self.parse_time(times)
            print("title:", title)
            print("publish time:", publish_date)
            print("avatar:", avatar)
            print("picture:", picture)
            print("content:", content)
            # self.insert_op(title, username, url, publish_date, picture, content, platform_id, handle, classify, source)
            # file = open("E:/Fiddler Sessions/Sessions.txt", "w+")  # create the file if missing
            # file.truncate()  # clear the captured text
            # file.close()
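
    # Aside: the eval() above only parses the body because false/null/true are
    # pre-bound as globals, and it will execute anything that lands in the
    # capture file. A safer sketch (hypothetical helper, not in the original)
    # uses the standard-library json module; note that true/false/null then
    # become True/False/None instead of empty strings.
    def read_last_body(self, path='E:/Fiddler Sessions/Sessions.txt'):
        import json
        with open(path, 'r') as f:
            text = f.read().split('Request-body:')[-1]
        return json.loads(text) if text.strip() else None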
    # Automate the Toutiao 苏州 (Suzhou) channel
    def toutiao(self):
        while True:
            source = '苏州'  # default location if the item has no location label
            # Note: resource ids like :id/eyu are specific to this app version
            area_status = self.isElement('id', 'com.ss.android.article.news:id/eyu')  # location label present?
            if area_status:
                area = self.driver.find_element_by_id('com.ss.android.article.news:id/eyu').text
                if "m" in area:  # label ends with a distance, e.g. "苏州·xx 1.2km"
                    if '·' in area:
                        address = area.split(' ')[0]
                        source = address.split('·')[-1]
                else:
                    if '·' in area:
                        address = area
                        source = address.split('·')[-1]
                print("location:", source)
            dynamic_status = self.isElement('id', 'com.ss.android.article.news:id/d30')
            if dynamic_status:
                self.driver.find_element_by_id('com.ss.android.article.news:id/d30').click()  # open the post
                time.sleep(1)
                self.get_url(source)
                self.driver.back()  # back to the main feed
                time.sleep(1)
            self.driver.swipe(100, 1300, 100, 800, 2000)  # scroll to the next item
            time.sleep(1)
    # Fetch the article/post page and extract its fields
    def get_content(self, url):
        s = requests.session()
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        }
        # Cookies copied from a logged-in browser session; they expire and must be refreshed
        cookies = {'tt_webid': '6944229382073796126',
                   'csrftoken': '1fc04d1168f1dda1f56009c05e912386',
                   'passport_csrf_token_default': 'ad7a083c657ff583a0ec4fd7e1121e3b',
                   'passport_csrf_token': 'ad7a083c657ff583a0ec4fd7e1121e3b',
                   'sso_uid_tt_ss': 'baa07f37daa59fba18c57e68c235a7cd',
                   'toutiao_sso_user': '0901228a2038e713243a9a8e3a183ecb',
                   'toutiao_sso_user_ss': '0901228a2038e713243a9a8e3a183ecb',
                   'sso_uid_tt': 'baa07f37daa59fba18c57e68c235a7cd',
                   'uid_tt': '40aa4013460012d41179aa9a67e88799',
                   'sid_tt': '621f8eef127fd68c50990faf77a6e5f5',
                   'sessionid': '621f8eef127fd68c50990faf77a6e5f5',
                   'sid_guard': '621f8eef127fd68c50990faf77a6e5f5%7C1616829441%7C5184000%7CWed%2C+26-May-2021+07%3A17%3A21+GMT',
                   'uid_tt_ss': 'baa07f37daa59fba18c57e68c235a7cd',
                   'sessionid_ss': '0901228a2038e713243a9a8e3a183ecb',
                   's_v_web_id': 'verify_kmvhdyi1_d6py15Ud_f1PY_4PTf_8RGz_Nhpd8cDzmlRP',
                   '__ac_nonce': '06062e596007f40d1b10e',
                   '__ac_signature': '_02B4Z6wo00f011l60QQAAIDCjrz8f7ADJrdZXtWAALZHoTjmFJ2qk-rEha2WUWEoFdctDlVmqEwuZI5-BXprf3Ni1NXpPz4e4MIAJFDWnHASvBGpIoCjRgDL.GIC9YDAFlIAoSl12oWY30hI8a',
                   '_tea_utm_cache_2256': '{%22utm_source%22:%22copy_link%22%2C%22utm_medium%22:%22toutiao_android%22%2C%22utm_campaign%22:%22client_share%22}',
                   'MONITOR_WEB_ID': '9c51c9ee-7387-466f-8e73-916a734bdce1',
                   'tt_anti_token': 'IOoFvgLmWzi-89c8f0f20a1e5831f03b5ee9f9359d7c501c4c594e7b0e33fff69859992c5ebe',
                   'tt_scid': 'pVhdZDPy5On22JTkfQx9HsD94M6UP00Ej6ZSRvByY9xLO3TjpsW7f5l9Tlv8m.QX948f'}
        response = s.get(url=url, headers=headers, timeout=None, verify=False, cookies=cookies)
        response.encoding = response.apparent_encoding
        response_text = response.text
        if response.status_code == 200:
            soup = BeautifulSoup(response_text, 'html.parser')
            # Title
            if soup.find(attrs={'class': 'article-content'}):
                article = soup.find(attrs={'class': 'article-content'})
                title = article.find_all('h1')[0].text
            else:
                title = ''
            # Avatar
            if soup.find(attrs={'class': 'user-avatar'}):  # article page
                a = soup.find(attrs={'class': 'user-avatar'})
                avatar = a.find_all('img')[0].get('src')
            else:  # post page
                div = soup.find(attrs={'class': 'author-info'})
                avatar = div.find_all('img')[0].get('src')
            # Publish time
            if soup.find(attrs={'class': 'publish-time'}):
                times = soup.find(attrs={'class': 'publish-time'}).text
            else:
                text = soup.find(attrs={'class': 'article-meta'})
                if len(text.find_all('span')) == 3:
                    times = text.find_all('span')[2].text
                else:
                    times = text.find_all('span')[1].text
            # Body content
            if soup.find(attrs={'class': 'article-content'}):
                content_div = soup.find(attrs={'class': 'article-content'})
                content = content_div.find_all('article')[0].text
            elif soup.find(attrs={'class': 'weitoutiao-html'}):  # post page
                content = soup.find(attrs={'class': 'weitoutiao-html'}).text
            else:
                content = ''
            # First picture
            if soup.find(attrs={'class': 'pgc-img'}):
                img = soup.find(attrs={'class': 'pgc-img'})
                if len(img.find_all('img')) > 0:
                    picture = img.find_all('img')[0].get('src')
                else:
                    picture = ''
            else:
                img = soup.find(attrs={'class': 'image-list'})
                if img is not None and len(img.find_all('img')) > 0:
                    picture = "https:" + img.find_all('img')[0].get('src')
                else:
                    picture = ''
            return avatar, times, picture, content, title
        # Non-200 responses: return empty fields so the caller's unpacking still works
        return '', '', '', '', ''
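
Finally, a minimal way to run the pipeline (a sketch, assuming the Appium server is listening on port 4723 and the device's traffic is already proxied through Fiddler; the variable name is arbitrary):

import urllib3

if __name__ == '__main__':
    # Optional: silence the InsecureRequestWarning triggered by verify=False in get_content
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    bot = TouTiao()   # connect to Appium and open the 苏州 channel
    bot.toutiao()     # loop: open a post, parse the captured response, go back, swipe on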

 
