一个小爬虫
需求:
获取各展商的展位号和公司名称,目标页面链接如下:
http://www.cr-expo.com/cn/zhanshang.aspx?id=2020
实现
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
def to_excel(writer, datas, df, sheet_name):
    """Append ``datas`` to ``df`` row by row and write the frame to a sheet.

    Args:
        writer: Target handed to ``DataFrame.to_excel`` as ``excel_writer``.
        datas: Sequence of row records; each one is assigned to a new row
            of ``df``.
        df: DataFrame that receives the rows. It is modified in place, with
            row labels starting at 1.
        sheet_name: Name of the worksheet to write.

    Any exception raised while filling or writing is caught and reported on
    stdout; nothing is re-raised.
    """
    try:
        # enumerate() gives every record its own row label. Looking the
        # position up with datas.index(record) would return the first equal
        # record, so duplicate rows overwrote each other and were lost.
        for row_label, record in enumerate(datas, start=1):
            df.loc[row_label] = record
        df.to_excel(excel_writer=writer, sheet_name=sheet_name, index=False)
    except Exception as e:
        print("写入excel失败:%s" % e)
def get_html(url):
    """Download ``url`` and return the ``zs_three`` list elements it contains.

    The response is parsed with BeautifulSoup's ``html.parser`` and decoded
    as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The ``find_all`` result for every ``<ul class="zs_three">`` element
        in the page; empty when the page has none.

    Errors raised by ``urllib.request.urlopen`` are not caught here.
    """
    # The with-block closes the HTTP response once the markup has been read,
    # including when parsing raises; previously it was never closed.
    with urllib.request.urlopen(url) as web:
        soup = BeautifulSoup(web, 'html.parser', from_encoding='utf-8')
    return soup.find_all("ul", class_="zs_three")
def get_booth_data(url):
    """Extract booth numbers and company names from one listing page.

    Args:
        url: Address of the listing page; passed unchanged to ``get_html``.

    Returns:
        A list of dicts with the keys ``"展位号"`` (text of the item's
        ``<h4>`` after the colon) and ``"公司名称"`` (text of the item's
        ``<h1>``), one per ``<li>`` of the first ``zs_three`` list. The list
        is empty when the page contains no such list.
    """
    containers = get_html(url)
    if not containers:
        # No exhibitor list on this page: report no rows rather than
        # failing with IndexError on containers[0].
        return []

    datas = []
    for item in containers[0].find_all('li'):
        booth_tag = item.find("h4")
        name_tag = item.find("h1")
        if booth_tag is None or name_tag is None:
            # Items without both tags cannot produce a full record.
            continue
        booth_text = booth_tag.get_text()
        # Accept the full-width colon as well as the ASCII one, and keep the
        # whole text when the label has no separator at all.
        parts = booth_text.replace("：", ":").split(':')
        booth_no = parts[1] if len(parts) > 1 else booth_text
        datas.append({"展位号": booth_no, "公司名称": name_tag.get_text()})
    return datas
def sava_excel(datas):
    """Write the scraped rows to ``展位信息.xlsx`` in the working directory.

    Args:
        datas: Row records handed to ``to_excel``, which fills them into a
            frame with the columns ``展位号`` and ``公司名称``.
    """
    file = "展位信息.xlsx"
    df = pd.DataFrame(columns=('展位号', '公司名称'))
    # ExcelWriter.save() was removed in pandas 2.0, and calling save() and
    # then close() wrote the file twice. The context manager saves and
    # closes the workbook exactly once on exit.
    with pd.ExcelWriter(file) as writer:
        to_excel(writer, datas, df, sheet_name="展位信息")
if __name__ == '__main__':
    # The exhibitor listing is paginated: fetch pages 1 through 32 in order,
    # collect every record, then write them all to the workbook.
    listing_url = "http://www.cr-expo.com/cn/zhanshang.aspx?id=2020"
    all_rows = []
    for page_no in range(1, 33):
        page_url = "{}&page={}".format(listing_url, page_no)
        all_rows += get_booth_data(page_url)
    sava_excel(all_rows)