python爬虫------------旅游的地点的爬取和可视化

1.选题背景

我国旅游行业的极速发展,因为之前疫情原因,使得国内旅游成为新风潮,由于国内疫情解封,使得中国成为最先开放旅游的国家,

本次项目可视化就是分析国内旅游的数据,分析适合出行旅游的时间与地点信息.

2.设计方案

 1.向目标网络发送请求

   2.获取数据 网页源码

   3.筛选我们需要的数据 网页源代码

   4.筛选数据 获取数据

   5.for循环  获取每一页的数据

   6.提前数据

   出发日期 天数 人均费用 人物 玩法.....

   7.保存数据

   8.多页爬取

   9.可视化分析根据项目主题,设计项目实施方案,包括实现思路与技术难点等

导入所需要的库

 

#筛选数据

import parsel

import csv 

import time

import random

import pandas

import matplotlib.pyplot as plt

from pyecharts import Map

import jieba

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from PIL import Image

from os import path

##数据可视化

import matplotlib.pyplot as plt

 2.数据的抓取

 

3.主题云词图

list_all = []

text = ''

with open('C:/Users/wdsa/Desktop/去哪儿.csv', 'r', encoding='utf8') as file:

    t = file.read()

    file.close()

for i in title_list:

        if type(i) == float:

            pass

        else:

            list_all.append(i)

  txt = " ".join(list_all)

    backgroud_Image = plt.imread('C:/Users/wdsa/Desktop/阳光.jpg')

    print('加载图片成功!')

    w = WordCloud(

        font_path="msyh.ttc",

        width=1000,

        height=800,

        background_color="white",

        stopwords=STOPWORDS,

        max_font_size=150

    )

    w.generate(txt)

    print('开始加载文本')

    img_colors = ImageColorGenerator(backgroud_Image)

    w.recolor(color_func=img_colors)

    plt.imshow(w)

    plt.axis('off')

    plt.show()

    d = path.dirname(__file__)

        # w.to_file(d,"C:/Users/wdsa/Desktop/wordcloud.jpg")!!!!!!!!!!!

    print('生成词云成功!')

 4.爬取浏览量前五的主题

 

6.出行方式云词图

 list_all_1 = []

txt_1 = ''

for j in GO_list:

    if i == 'nan':

        pass

    elif type(j) == float:

        pass

    else:

        list_all_1.append(j)

txt_1 = " ".join(list_all_1)

backgroud_Image = plt.imread('C:/Users/wdsa/Desktop/阳光.jpg')

print('加载图片成功!')

 pose= WordCloud(

        font_path="msyh.ttc",

        width=1000,

        height=800,

        background_color="white",

        stopwords=STOPWORDS,

        max_font_size=150,    )

pose.generate(txt_1)

print('开始加载文本')

 img = ImageColorGenerator(backgroud_Image)

w.recolor(color_func=img)

plt.imshow(pose)

plt.axis('off')

plt.show()

d = path.dirname(__file__)

print('生成词云成功!')

完整代码

import random

import time 

import pandas as pd

import requests

import parsel

import csv

import time

import random

import pandas

import matplotlib.pyplot as plt

from pyecharts import Map

 import jieba

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from PIL import Image

from os import path

import matplotlib.pyplot as plt

csv_qne = open('C:/Users/wdsa/Desktop/去哪儿.csv',"a",encoding = "utf-8",newline = "")

csv_writer = csv.writer(csv_qne)

csv_writer.writerow(['地点','浏览量','短评','日期','人物','天数','人均消费','详情页'])

for i in range(1,5):

    url = f'https://travel.qunar.com/travelbook/list.htm?page={i}&order=hot_heat'

    response = requests.get(url = url )

    print(response)

    data_html = response.text

    data_html = response.text

    selector = parsel.Selector(data_html)

    print(selector)

    for i in url_list:

        detail_id = i.replace('/youji/','')

        datail_url = 'https://travel.qunar.com/travelbook/note/' + detail_id

        response_1 = requests.get(url =datail_url)

        data_html_1= response_1.text

        selector_1 = parsel.Selector(data_html_1)

        title = selector_1.css('.b_crumb_cont *:nth-child(3)::text').get()

        comment = selector_1.css('.title.white::text').get()

        count = selector_1.css('.view_count::text').get()

        data = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.when > p > span.data::text').get()

        days = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.howloog > p > span.data::text').get()

        character = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.howlong > p > span.data::text').get()

        money = selector_1.css('#js_mainleft > div.b_foreword > ul > li.f_item.who > p > span.data::text').get()

        play_list = selector_1.css('#js_mainleft > dix.b_foreword > ul > li.f_item.how > p > span.data::text').get()

        csv_writer.writerow([title,comment,count,data,days,money,data,play_list,datail_url])

        time.sleep(1)

csv_qne.close()

title_list = []

speake = []

happer_day = []

count_list = []

days_list = [

GO_list = []

meony_list = []

url_list_to = []

af= pd.read_csv('C:/Users/wdsa/Desktop/去哪儿.csv') 

for i in af['地点']:

    title_list.append(i)

 for i in af['短评']:

    speake.append(i)

 for i in af['浏览量']:

    count_list.append(i)

 for i in af['日期']:

    days_list.append(i)

 for i in af['天数']:

    happer_day.append(i)

 for i in af['人物']:

    GO_list.append(i)

 for i in af['人均消费']:

    meony_list.append(i)

 for i in af['详情页']:

    url_list_to.append(i)

 df =pd.DataFrame(af)

 list_all = []

text = ''

with open('C:/Users/wdsa/Desktop/去哪儿.csv', 'r', encoding='utf8') as file:

    t = file.read()

    file.close()

   for i in title_list:

        if type(i) == float:

            pass

        else:

            list_all.append(i)

 txt = " ".join(list_all)

    backgroud_Image = plt.imread('C:/Users/wdsa/Desktop/阳光.jpg')

    print('加载图片成功!')

  w = WordCloud(

        font_path="msyh.ttc",

        width=1000,

        height=800,

        background_color="white",

        stopwords=STOPWORDS,

       max_font_size=150,

)

    w.generate(txt)

    print('开始加载文本')

    img_colors = ImageColorGenerator(backgroud_Image)

    w.recolor(color_func=img_colors)

    plt.imshow(w)

    plt.axis('off')

    plt.show()

    d = path.dirname(__file__)

    print('生成词云成功!')

plt.rcParams['font.sans-serif'] = ['SimHei']

plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(17, 15))

bar_width= 0.25 

plt.bar(title_list[:20:4],

        count_list[:5],

        bar_width,

        align="center",

        color="red",

        label="unpurchased",

        alpha=0.5)

plt.show()

plt.figure(figsize=(17, 15))

plt.plot(title_list[:20:4],

         count_list[:5],

          color="red",

            label='浏览量',

         marker='*',

         )

plt.show()

list_all_1 = []

txt_1 = ''

for j in GO_list:

    if i == 'nan':

        pass

    elif type(j) == float:

        pass

    else:

        list_all_1.append(j)

txt_1 = " ".join(list_all_1) 

backgroud_Image = plt.imread('C:/Users/wdsa/Desktop/阳光.jpg')

print('加载图片成功!')

pose= WordCloud(

        font_path="msyh.ttc",

        width=1000,

        height=800,

        background_color="white",

        stopwords=STOPWORDS,

        max_font_size=150,

    )

pose.generate(txt_1)

print('开始加载文本')

img = ImageColorGenerator(backgroud_Image)

w.recolor(color_func=img)

plt.imshow(pose)

plt.axis('off')

plt.show()

d = path.dirname(__file__)

        # w.to_file(d,"C:/Users/wdsa/Desktop/wordcloud.jpg")!!!!!!!!!!!

print('生成词云成功!')

 

 

 

 

 

总结

综上所有数据可知,我们用去哪儿网对于国内旅游城市进行了一定的分析以及排名,让人们出游有更加合理的选择,更体现国内疫情解封后每个城市旅行的情况

 

posted @ 2023-06-10 22:06  我真是小白  阅读(459)  评论(0编辑  收藏  举报