# 爬虫基础_谭希成

# Introductory crawler program: fetch one page and print its status and body.
import urllib.request

# Target URL.
url = "http://www.baidu.com"
# Open the URL. The context manager closes the connection once we are done,
# so the socket is not leaked.
with urllib.request.urlopen(url) as responsel:
    # Print the HTTP status code (`status` replaces the deprecated getcode()).
    print(responsel.status)
    # Read and print the page content (raw bytes).
    print(responsel.read())

# Crawler that sends form data and custom headers in a POST request.

import urllib
import urllib.parse  # imported explicitly: urlencode lives in urllib.parse
from urllib import request

# Target URL.
url = "http://www.zhihu.com/signin?next=%2F"
# User-Agent request header value.
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
# Form parameters. Every value needs a key; the placeholder is the password.
values = {"username": "17388888888", "password": "xxxxxxxx"}
# URL-encode the form and convert it to bytes, as required for a request body.
data = urllib.parse.urlencode(values).encode(encoding='UTF-8')
# Build the request headers.
headers = {
    'User-Agent': user_agent,
    'Referer': 'http://www.zhihu.com/articles',
}
# Build the request object.
req = request.Request(url, data=data, headers=headers, method='POST')
# Open the page; the context manager closes the connection afterwards.
with request.urlopen(req) as resp:
    # Read and print the page content (raw bytes).
    print(resp.read())

# Crawler that collects the cookies set by a server.
from urllib import request
from http import cookiejar

# CookieJar instance that stores the cookies received.
cookie = cookiejar.CookieJar()
# Handler that feeds response cookies into the jar.
handler = request.HTTPCookieProcessor(cookie)
# Opener ("downloader") that uses the cookie handler.
opener = request.build_opener(handler)
# Download the page. The handler fills the jar while the response is opened,
# so the connection can be closed straight away.
resp = opener.open('http://www.baidu.com')
resp.close()
# Iterate over the collected cookies.
for item in cookie:
    # f-strings keep the same output and also tolerate a cookie whose value
    # is None (plain string concatenation would raise TypeError).
    print(f'NAME={item.name}')
    print(f'VALUE={item.value}')

# Regular expressions
import re

# Define the regex rule (the pattern). The r prefix makes a raw string, so
# Python leaves backslash sequences such as "\n" untouched for the regex engine.
pattern = re.compile(r'hello')
# re.match() only tries to match at the very first character of the string.
# On a successful match, .span() gives the position (e.g. (0, 5)) and
# .group() gives the matched text. This sample does not start with "hello",
# so result1 is None and nothing is printed.
result1 = re.match(pattern, "htllno hello")
if result1:
    print(result1.group())
# Likewise re.match(pattern, "hell hello") finds nothing, because matching
# starts from the front of the string.
result2 = re.match(pattern, "htllohello")
if result2:
    print(result2)
else:
    print('no')
# "." between a and c matches exactly one arbitrary character (not a newline).
pattern = re.compile(r'a.c')
res = re.match(pattern, "a2cdefg")
print(res)
# Matching a literal dot: special characters are escaped with a backslash.
# A raw string is the right tool here; it passes "\." to the regex engine
# unchanged and avoids an invalid-escape warning from Python.
pattern2 = re.compile(r'a\.c')
res2 = re.match(pattern2, 'a.cedfd')
print(res2)
# Character class: [afg] matches one of a, f or g; [a-f] would be a range.
# No commas inside the brackets: a comma there is a literal member of the
# class, so "[a,f,g]" would also accept ",".
pattern3 = re.compile(r'a[afg]cdefg')
res3 = re.match(pattern3, "afcdefg")

# Digits: \d matches one digit, \D matches one non-digit.
patter4 = re.compile(r'a\dc')
res4 = re.match(patter4, "a5c")

# Whitespace: \s matches one whitespace character, \S one non-whitespace.
patter5 = re.compile(r'a\sc')
res5 = re.match(patter5, "a c")

# Repetition: "*" lets the preceding character repeat zero or more times,
# so r'acb*' is "ac" followed by any number of "b".
# This sample does not start with "ac", so re.match() returns None here.
patter6 = re.compile(r'acb*')
res6 = re.match(patter6, "dfgdgdkbbbbbbbb")

# Alternation: match either "abc" or "efg".
rx = re.compile(r'abc|efg')
res = re.search(rx, "dksjfkabc")

# Repeating a group: (abc){2} matches two consecutive "abc".
rx = re.compile(r'(abc){2}')
res = re.search(rx, "fiowejabcabcjjj")

# Grouping: "abc" must be immediately followed by "def" for a match.
rx = re.compile(r'(abc)(def)')
res = re.search(rx, "gegabcdefdgf")

# Named group: (?P<p1>...) gives the group the alias "p1".
rx = re.compile(r'(?P<p1>abc)')
res = re.search(rx, "afdsfaabc")

# Backreference: \1 refers to the first group (the \d here) and must repeat
# exactly the digit it captured. This sample has no "abc" between the two
# digits, so the search returns None.
rx = re.compile(r'(\d)abc\1')
res = re.search(rx, "5hhiue5")

# (?P=tt) is a backreference to the group named "tt".
rx = re.compile(r'(?P<tt>abc)efg(?P=tt)')
res = re.search(rx, "abcefgabc")

# "*" matches zero or more repetitions and is greedy: \d* consumes as many
# digits as it can, then \w takes the next word character.
rx = re.compile(r"\d*\w")
res = re.search(rx, "389747387dkjf")

# {5,10} matches between 5 and 10 digits.
rx = re.compile(r'\d{5,10}')
res = re.match(rx, '188888888')

# "?" means zero or one; "+" means one or more and takes as many as it can;
# \d{10} matches exactly ten digits.

rx = re.compile(r'\d{5,11}@\w{2}\.com')
res = re.match(rx, '2591162883@qq.com')

# Greedy mode: \w+ consumes every consecutive word character.
rx = re.compile(r'\w+')
res = re.match(rx, 'dpgjfdoijgfosfdgjfgdj')

# Boundary matching: "^" anchors the pattern at the start and "$" at the end.
# Use "$" with search(): search() can match at any position, whereas match()
# only matches at the beginning of the string.
rx = re.compile(r'abc$')
res = re.search(rx, 'wefweaabc')

# \A matches only at the very start of the string (here "bea");
# \Z matches only at the very end.
rx = re.compile(r'\Abea')
res = re.search(rx, 'beauti\nful')

# \b is a word boundary. Placed after "ver" it requires the word to end
# there; placed before, it would require the word to start there.
rx = re.compile(r'ver\b')
res = re.search(rx, "never")