Python爬虫
python中urllib, urllib2, urllib3, httplib, httplib2, requests 的区别
1. python3中只要记住:urllib , requests 两个库
# GET a URL
import urllib.request
with urllib.request.urlopen('http://www.python.org/') as f:
    print(f.read(300))

# PUT a request
import urllib.request
DATA = b'some data'
req = urllib.request.Request(url='http://localhost:8080', data=DATA, method='PUT')
with urllib.request.urlopen(req) as f:
    pass
print(f.status)
print(f.reason)

# Basic HTTP authentication
import urllib.request
auth_handler = urllib.request.HTTPBasicAuthHandler()
auth_handler.add_password(realm='PDQ Application',
                          uri='https://mahler:8092/site-updates.py',
                          user='klem',
                          passwd='kadidd!ehopper')
opener = urllib.request.build_opener(auth_handler)
urllib.request.install_opener(opener)
urllib.request.urlopen('http://www.example.com/login.html')

# Using a proxy (with proxy authentication)
proxy_handler = urllib.request.ProxyHandler({'http': 'http://www.example.com:3128/'})
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
proxy_auth_handler.add_password('realm', 'host', 'username', 'password')
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
opener.open('http://www.example.com/login.html')

# Adding a header to a single request
import urllib.request
req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
r = urllib.request.urlopen(req)

# Changing the User-agent on an opener
import urllib.request
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
opener.open('http://www.example.com/')

# Passing URL parameters with GET
import urllib.request
import urllib.parse
params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
with urllib.request.urlopen(url) as f:
    print(f.read().decode('utf-8'))

# Passing parameters with POST (data must be bytes)
import urllib.request
import urllib.parse
data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
data = data.encode('ascii')
with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
    print(f.read().decode('utf-8'))
# Specify a proxy explicitly.
# NOTE(review): FancyURLopener is deprecated since Python 3.3 — prefer
# urllib.request.build_opener(urllib.request.ProxyHandler(...)) in new code.
import urllib.request
proxies = {'http': 'http://proxy.example.com:8080/'}
opener = urllib.request.FancyURLopener(proxies)
with opener.open("http://www.python.org") as f:
    f.read().decode('utf-8')

# Use no proxy at all, overriding any proxy environment variables.
import urllib.request
opener = urllib.request.FancyURLopener({})
with opener.open("http://www.python.org/") as f:
    f.read().decode('utf-8')
Requests 官方文档
http://docs.python-requests.org/zh_CN/latest/user/quickstart.html
http://docs.python-requests.org/zh_CN/latest/
BeautifulSoup基本用法总结
爬虫必备—BeautifulSoup
# String-processing reference notes:
#   https://www.cnblogs.com/hardsoftware/p/6220374.html
#   https://www.cnblogs.com/OldJack/p/7455124.html
#
# Fetch an intranet page behind NTLM (Windows domain) authentication and
# print the text and absolute URL of every link in the main layui container.
import requests
from requests_ntlm import HttpNtlmAuth
from bs4 import BeautifulSoup

BASE_URL = "http://e"

# NOTE(review): credentials are hard-coded; move them to env vars or a config
# file before sharing this script.
response = requests.get(
    BASE_URL + "/CAD/Index",
    auth=HttpNtlmAuth('cnsvwsh00\\lh', 'xxx'),
    timeout=30,  # avoid hanging forever on an unreachable intranet host
)
# Fail fast on HTTP errors instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html5lib')

# The links of interest live inside the main layui container; find() returns
# None when the page layout changes, so guard against an AttributeError.
container = soup.find(class_="layui-container clearfix")
if container is None:
    raise RuntimeError("page layout changed: 'layui-container clearfix' not found")

for anchor in container.find_all('a'):
    # .string is the tag's single text child, or None for nested markup.
    if anchor.string is not None:
        print(anchor.string)
    # Anchors without an href (e.g. named anchors) would raise KeyError with
    # anchor['href']; skip them instead.
    href = anchor.get('href')
    if href is None:
        continue
    if href.startswith('/'):
        # Site-relative link: resolve against the intranet host.
        print(BASE_URL + href)
    else:
        print(href)