Python - 使用 Requests 库完成 Post 表单操作

"""
使用Requests库完成Post表单操作
"""
# -*- coding: utf-8 -*-
import requests

from bs4 import BeautifulSoup

# Request headers that make this script's traffic look like it comes from
# a real desktop Chrome browser rather than an automated client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
        "(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,*/*;q=0.8"
    ),
}

if __name__ == "__main__":

    # Form fields expected by the demo login endpoint.
    login_form = {"username": "anything", "password": "password"}

    # A Session keeps cookies across requests, so the login cookie set by
    # welcome.php is automatically sent along with the profile request.
    session = requests.session()
    post_obj = session.post(
        "http://pythonscraping.com/pages/cookies/welcome.php", login_form)

    s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
    print(post_obj.text.encode("utf-8"))
    print(s.text.encode("utf-8"))

    # Show the cookies the session accumulated during login.
    print(session.cookies.get_dict())

 

# -*- coding: utf-8 -*-
'''
目标站点分析
网页结构分析
--开干--
1、单页内容
2、正则
3、保存json
4、多线程循环
'''
# .*具有贪婪的性质,首先匹配到不能匹配为止,根据后面的正则表达式,会进行回溯。
# .*?(短)则相反,一个匹配以后,就往下进行,所以不会进行回溯,具有最小匹配的性质。
# re.S 让.匹配换行符
#----------------------------------
import json
import requests
from requests.exceptions import RequestException
import re
import time
from multiprocessing import Pool

# Request headers for maoyan.com.  These matter: without a browser-like
# User-Agent and a Referer from the site itself, requests get rejected.
headers = {
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'),
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6',
}

def get_one_page(url):
    """Fetch *url* and return its HTML body, or None on any failure.

    None is returned both for non-200 HTTP responses and for network
    errors raised by the requests library.
    """
    try:
        response = requests.get(url, headers=headers)
    except RequestException:
        return None
    return response.text if response.status_code == 200 else None

# Compiled once at import time so repeated calls don't pay for re.compile.
# Raw strings fix the invalid '\d' escape in the original (a SyntaxWarning
# on modern Python).  re.S lets '.' match newlines; '.*?' is non-greedy,
# so each group captures the minimal span instead of backtracking.
_MOVIE_PATTERN = re.compile(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)


def parse_one_page(html):
    """Yield one dict per movie entry found in a Maoyan board page.

    html -- the raw HTML of one board page (as returned by get_one_page).
    Yields dicts with keys: index, image, title, actor, time, score.
    """
    for item in _MOVIE_PATTERN.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Strip whitespace, then drop the 3-char "主演:" label.
            'actor': item[3].strip()[3:],
            # Drop the 5-char "上映时间:" label.
            'time': item[4].strip()[5:],
            # The score's integer part ("9.") and fraction ("6") are
            # captured separately in the markup; join them back.
            'score': item[5] + item[6],
        }

def write_to_file(content):
    """Append *content* to result.txt as one JSON object per line.

    ensure_ascii=False keeps Chinese characters readable in the file
    instead of escaping them to \\uXXXX sequences.
    """
    line = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line + '\n')

def main(offset):
    """Scrape one ranking page at *offset* and append every movie to result.txt."""
    page_url = 'http://maoyan.com/board/4?offset={}'.format(offset)
    for movie in parse_one_page(get_one_page(page_url)):
        write_to_file(movie)


if __name__ == '__main__':
    # Sequentially fetch pages at offsets 0, 10, ..., 90, pausing one
    # second between requests to avoid hammering the site.
    for page in range(10):
        main(offset=page * 10)
        time.sleep(1)
    # Alternative: fan out with a process pool instead of the loop above.
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
# coding=utf-8

'''
1、抓取索引页内容
2、抓取详情页内容
3、下载图片保存数据库
4、循环及多线程
'''

import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup
# User-Agent header sent with article detail requests; an old IE string
# is apparently sufficient for these pages.
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}


def get_onepage_index(i, keywords):
    """Request one page of toutiao search results and return the JSON text.

    i -- result offset (page index * 20, as a string)
    keywords -- the search term
    Returns the raw response body on HTTP 200, otherwise None.
    """
    query = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
    }
    try:
        response = requests.get('https://www.toutiao.com/search_content/?',
                                params=query)
    except RequestException:
        print('something is wrong!')
        return None
    return response.text if response.status_code == 200 else None


def parse_onepage_index(html):
    """Yield the 'article_url' of every entry in a toutiao search payload.

    html -- the JSON text returned by get_onepage_index(), or None when
    the fetch failed.

    Fix: the original crashed with a TypeError when *html* was None
    (which get_onepage_index returns on any failure); a None or empty
    payload now simply yields nothing.  Entries lacking an
    'article_url' key still yield None, as before.
    """
    if not html:
        return
    # json.loads turns the str payload into a dict.
    data = loads(html)
    if data and 'data' in data:
        # 'or []' guards against an explicit null under the 'data' key.
        for item in data['data'] or []:
            yield item.get('article_url')


def get_page_detail(url):
    """Download an article detail page; return its HTML, or None on failure."""
    try:
        resp = requests.get(url, headers=headers)
    except RequestException:
        print('wrong url:', url)
        return None
    return resp.text if resp.status_code == 200 else None


def parsepage(html):
    """Parse an article page with BeautifulSoup and print its <title> text."""
    document = BeautifulSoup(html, 'lxml')
    print(document.title.string)


def main():
    """Crawl toutiao search-result pages and print each article's URL and title.

    Fixes over the original:
    - the first (discarded) call to parse_onepage_index(html) is removed;
    - a failed index fetch (html is None) is skipped instead of being
      passed to the parser, which raised a TypeError;
    - identity comparison with None uses 'is', not '=='.
    """
    for page in range(1, 2):
        offset = str(page * 20)  # the API pages results in steps of 20
        index_html = get_onepage_index(offset, '街拍')
        if index_html is None:
            continue  # fetch failed; nothing to parse on this page
        for url in parse_onepage_index(index_html):
            print(url)
            detail_html = get_page_detail(url)  # article HTML or None
            if detail_html is not None:
                parsepage(detail_html)  # bs4 extracts and prints the title


# get_page_detail('http://toutiao.com/group/6596305324645286404/')

if __name__ == '__main__':
    main()

 

如有疑问,请留言。

如觉得有帮助,请点个赞,谢谢!

posted @ 2018-11-05 16:27  划边逅  阅读(6887)  评论(0编辑  收藏  举报