爬虫学习笔记整理一
tips
- 不论爬取哪个网页,都可以加上请求头信息
requests使用代理
import requests

# httpbin's /ip endpoint echoes back the caller's IP address, so the printed
# result shows whether the request really went out through the proxy.
url = "http://httpbin.org/ip"
proxies = {'http': '119.39.68.252:8118'}
response = requests.get(url, proxies=proxies)
print(response.text)
lxml是用C语言写的,所以IDE里没有代码提示(自动补全)
- 解析html字符串,使用etree.HTML(htmlstr)进行解析
- 解析html文件,使用etree.parse(filepath,parser=etree.HTMLParser(encoding="utf-8"))
- print(resp.text)#返回的是经过解码后的字符串,是str(unicode)类型,有时候会出现解码为乱码的情况,这时就需要自己指定解码方式
- print(resp.content)#返回的是一个原生的字符串,就是从网页上抓取的没有经过处理的字符串,是bytes类型
# Fetch the page, then decode the raw bytes ourselves: resp.text guesses the
# codec and can produce mojibake, so we decode resp.content with an explicit
# codec (this site is GBK-encoded — confirm for other targets).
resp = requests.get(url, headers=HEADERS)
text = resp.content.decode("gbk", errors="ignore") # errors="ignore" skips undecodable bytes instead of raising UnicodeDecodeError
xpath语法
trs = table.xpath(".//tr[position()>2]")
# Crawl list pages 1-4 of the site; parse_page (defined elsewhere in the
# notes) handles one page. Original paste had lost the loop-body indentation.
for i in range(1, 5):
    url = "https://www.gushiwen.org/default_%s.aspx" % i
    parse_page(url)
# Select all lazy-loaded images, excluding animated gifs, then read the real
# image URL from the data-original attribute (src only holds a placeholder
# for lazy-loaded images — verify against the target page).
imgs = html.xpath("//a[@class='col-xs-6 col-sm-3']//img[@class!='gif']")
for img in imgs:
    img_url = img.get('data-original')  # read one attribute of the element
json字符串操作
在python中,只有基本数据类型才可以直接转换为json格式的字符串,即
int、float、str、list、dict、tuple(其中tuple会被序列化为json数组)
import json

# 1. Serialize a Python object into a JSON string.
persons = [
    {'name': '张三', 'age': 18, 'gender': '男'},
    {'name': '张四', 'age': 17, 'gender': '女'},
    {'name': '张五', 'age': 19, 'gender': '男'},
]
json_str = json.dumps(persons)
print(type(json_str), json_str)

# 2. Store JSON in a file.
with open("person.json", "w", encoding="utf-8") as f:
    # f.write(json_str)  # option 1: write the pre-built string
    # option 2: dump the object straight into the file handle (no separate
    # dumps step); ensure_ascii=False keeps the Chinese text readable.
    json.dump(persons, f, ensure_ascii=False)

# 3. Parse a JSON string back into Python objects (json.loads).
jsonstr = '[{"name": "\u5f20\u4e09", "age": 18, "gender": "\u7537"}, {"name": "\u5f20\u56db", "age": 17, "gender": "\u5973"}, {"name": "\u5f20\u4e94", "age": 19, "gender": "\u7537"}]'
persons = json.loads(jsonstr)
for p in persons:
    print(p)

print("-" * 50)

# 4. Read JSON straight from a file (json.load).
with open("person.json", "r", encoding="utf-8") as f:
    persons = json.load(f)
print(type(persons))
for p in persons:
    print(p)
存储为csv文件
import csv
# =============================================================================
# 读取csv文件
# =============================================================================
def read_csv_by_index():
    """Read the CSV row-by-row, addressing columns by numeric index."""
    with open("5-冀东水泥集团问题汇总【36家】.csv", "r", encoding="utf-8") as f:
        # reader is an iterator over the file's rows
        reader = csv.reader(f)
        next(reader)  # advance past the header row
        for row in reader:
            express = row[1]
            ytype = row[2]
            company = row[-1]
            # keep only fully-populated rows
            if express != '' and ytype != '' and company != '':
                hazard = {'express': express, 'ytype': ytype, 'company': company}
                print(hazard)
def read_csv_by_dict():
    """Read the CSV with DictReader: the header row becomes the dict keys.

    DictReader consumes the header row itself, so iteration yields one
    dict per data row (no manual next() needed).
    """
    with open("5-冀东水泥集团问题汇总【36家】.csv", "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for line in reader:
            print(line)
# =============================================================================
# 写入数据到csv文件
# =============================================================================
def write_list_to_csv():
    """Write tuple rows to person_list.csv with a header row."""
    # header row
    headers = ['username', 'age', 'sex']
    # data rows
    data = [
        ('张三', 12, '男'),
        ('李三', 19, '男'),
        ('张五', 28, '男'),
        ('王小二', 18, '女'),
    ]
    # encoding="utf-8" avoids mojibake; newline="" stops the csv module from
    # emitting a blank line between rows on Windows.
    with open("person_list.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)  # header
        writer.writerows(data)    # all data rows at once
def write_dict_to_csv():
    """Write dict rows to person_dict.csv; dict keys map to the header."""
    headers = ['username', 'age', 'sex']
    data = [
        {'username': '张三', 'age': 18, 'sex': '男'},
        {'username': '李三', 'age': 16, 'sex': '女'},
        {'username': '张五', 'age': 18, 'sex': '女'},
        {'username': '王小二', 'age': 19, 'sex': '男'},
    ]
    # encoding/newline: same rationale as write_list_to_csv
    with open("person_dict.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, headers)
        # unlike csv.writer, DictWriter only emits the header when asked
        writer.writeheader()
        writer.writerows(data)
# Script entry point: run the dict-based writer demo.
if __name__ == "__main__":
    write_dict_to_csv()
正则模块
# Strip characters that are not allowed (or awkward) in file paths.
title = re.sub(r'[\??\.。!!]', '', title)
suffix = os.path.splitext(img_url)[1]  # file extension of the image URL
# Download the image bytes and save them to disk ("wb": raw bytes, no decode).
data = requests.get(img_url).content
with open("images/" + filename, "wb") as f:
    f.write(data)
selenium+chromedriver
chromedriver下载地址:
http://chromedriver.storage.googleapis.com/index.html
根据谷歌浏览器版本下载对应的驱动chromedriver
import time
from selenium import webdriver
def automation(url):
    """Open `url` in Chrome via chromedriver and demo page/element access.

    Requires a chromedriver.exe matching the installed Chrome version.
    """
    # driver_path points at the chromedriver.exe binary
    driver_path = "F:\\python\\chromedriver_win32\\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=driver_path)
    # drive the browser through the driver object
    driver.get(url)
    # page source is exposed as a property, not a method
    print(driver.page_source)
    # Element access (way 1): must happen BEFORE quitting the driver —
    # the original notes placed it after driver.quit(), which raises.
    inputTag = driver.find_element_by_id("kw")  # search input box
    inputTag.send_keys("python")
    time.sleep(5)
    # driver.close() would only close the current tab; quit() exits the browser
    driver.quit()
# Script entry point: drive the automation demo against Baidu.
if __name__ == "__main__":
    url = "http://www.baidu.com"
    automation(url)