Python学习26
python—简单数据抓取日常三(IP地址代理)
学习内容:
1、简单IP地址代理
2、利用蘑菇代理实现IP地址代理刷新本地ip地址
3、利用蘑菇代理实现IP地址代理抓取安居客信息并实现多线程
1、简单IP地址代理
import requests
from lxml import etree

# Proxy address (placeholder — substitute a real "ip:port" before running).
proxy = {"http": "代理ip:端口号"}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}
# BUG FIX: requests takes the keyword `proxies`, not `proxy`.
# The original `proxy=proxy` raises
# "TypeError: request() got an unexpected keyword argument 'proxy'".
source = requests.get('http://2021.ip138.com/', headers=headers, proxies=proxy).text
# ip138 echoes the IP it sees: the <a> text is the address itself,
# the second text node of the same <p> is the geo/carrier description.
demo = etree.HTML(source).xpath('/html/body/p[1]/a/text()')
content = etree.HTML(source).xpath('/html/body/p[1]/text()[2]')
print(demo)
print(content)
2、利用蘑菇代理实现IP地址代理刷新本地ip地址
import requests
from lxml import etree

# Mogu proxy tunnel order key, sent as an HTTP Basic credential.
appKey = "Nk1WTVBqODJDMlVmOWdkRDp5cGY2SWo0RGJzZGYzNnow"
# Mogu tunnel proxy server endpoint.
ip_port = 'secondtransfer.moguproxy.com:9001'
# Route both schemes through the same tunnel endpoint.
proxy = {scheme: scheme + "://" + ip_port for scheme in ("http", "https")}
headers = {
    "Proxy-Authorization": 'Basic ' + appKey,
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
}
# Ask ip138 which IP it sees; TLS verification is off because the tunnel
# re-signs traffic, and redirects are suppressed to read the first response.
response = requests.get('http://2021.ip138.com/', headers=headers, proxies=proxy,
                        verify=False, allow_redirects=False)
# Parse the page once and query it twice: the <a> text is the visible IP,
# the second text node of the same <p> carries the location/carrier string.
page = etree.HTML(response.text)
demo = page.xpath('/html/body/p[1]/a/text()')
content = page.xpath('/html/body/p[1]/text()[2]')
print(demo)
print(content)
多次输出结果不同:
['106.35.173.120']
['] 来自:中国内蒙古包头 电信\n']
['223.242.246.60']
['] 来自:中国安徽淮南田家庵区 电信\n']
3、利用蘑菇代理实现IP地址代理抓取安居客信息并实现多线程
import requests
from lxml import etree
from multiprocessing import Pool
import re

# Mogu proxy tunnel order key, sent as an HTTP Basic credential.
appKey = "Nk1WTVBqODJDMlVmOWdkRDp5cGY2SWo0RGJzZGYzNnow"
# Mogu tunnel proxy server endpoint.
ip_port = 'secondtransfer.moguproxy.com:9001'
# Route both schemes through the tunnel.
proxy = {"http": "http://" + ip_port, "https": "https://" + ip_port}
headers = {
    "Proxy-Authorization": 'Basic ' + appKey,
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4"
}
# Tianjin district slugs to scrape on Anjuke.
lists = ['xiqing', 'tanggu', 'nankai', 'jinnan', 'wuqing', 'hedong', 'hexi', 'dongli']


def index(page, district='jinnan'):
    """Fetch one Anjuke second-hand-listing page for `district` and print
    the listing titles, layout/size, address and price found on it.

    BUG FIXES vs the original:
    - `index` was redefined inside `for i in range(8)`; a nested def closes
      over the loop variable (late binding) and cannot be pickled by
      `multiprocessing` under the spawn start method. It is now a top-level
      function taking the district explicitly (default keeps the old
      single-argument call working).
    - the URL hard-coded 'sale/jinnan/' AND appended the district, yielding
      double-segment paths like '/sale/jinnan/xiqing/p1/'; the fixed URL is
      '/sale/<district>/p<page>/'.
    """
    url = ('https://tianjin.anjuke.com/sale/' + str(district)
           + '/p' + str(page) + '/?from=SearchBar')
    # verify=False: the tunnel re-signs TLS; allow_redirects=False: keep the
    # first response (redirects usually mean the anti-bot page).
    source = requests.get(url, headers=headers, proxies=proxy,
                          verify=False, allow_redirects=False).text
    tree = etree.HTML(source)
    # XPaths copied verbatim from browser dev tools (the embedded whitespace
    # is legal XPath); they target the listing cards under #__layout.
    name = tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/div[1]/h3/text()')
    content1 = "".join(tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[1]/p[1]/span/ text()'))
    content2 = tree.xpath('// *[ @ id = "__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[2]/p/text()')
    content3 = tree.xpath('//*[@id="__layout"]/div/section/section[3]/section[1]/section[2]/div/a/div[2]/div[1]/section/div[3]/span/text()')
    print(district)
    print(name)
    # Insert a separator after the bathroom count so the joined spans
    # ("3室1厅1卫100平米") stay readable.
    print(content1.replace('卫', '卫,'))
    print(content2)
    print(content3)
    print('===========当前在第' + str(page) + '页=================')


if __name__ == '__main__':
    # Single worker keeps the request rate polite; raise for more throughput.
    p = Pool(1)
    for district in lists:
        for page in range(1, 51):
            p.apply_async(index, args=(page, district))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')