alex_bn_lee

导航

< 2025年3月 >
23 24 25 26 27 28 1
2 3 4 5 6 7 8
9 10 11 12 13 14 15
16 17 18 19 20 21 22
23 24 25 26 27 28 29
30 31 1 2 3 4 5

统计

【439】Tweets processing by Python

参数说明:

  • coordinates: Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude).

1.文本文件转 json 格式

  读取 txt 文件中的 tweets 文本,将其转为 json 格式,可以打印输出,也可以提取详细信息

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import json
import os

# Read tweets stored as line-delimited JSON in the first file of a folder,
# parse each line into a dict, and pretty-print the first tweet.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store parsed tweets (dicts) in this list
tweets_data = []
# Context manager guarantees the handle is closed even on error; tweets
# contain non-ASCII text, so decode explicitly as UTF-8.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or corrupt lines only; a bare except would also
            # hide unrelated errors such as KeyboardInterrupt.
            continue

# print the first tweet with indentation
print(json.dumps(tweets_data[0], indent=4))

输出:

 

2. 读取关键字内容

  通过 .keys() 获取所有的键值

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import json
import os

# Read tweets (line-delimited JSON) from the first file in a folder and
# print every top-level key of the first tweet.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store parsed tweets (dicts) in this list
tweets_data = []
# Context manager guarantees the handle is closed even on error; tweets
# contain non-ASCII text, so decode explicitly as UTF-8.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or corrupt lines only; don't swallow other errors.
            continue

# A dict iterates over its keys directly; .keys() is redundant.
for k in tweets_data[0]:
    print(k)

 输出:

 

3. 输出键值信息

 代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import json
import os

# Read tweets (line-delimited JSON) from the first file in a folder and
# print each top-level key of the first tweet together with its value.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store parsed tweets (dicts) in this list
tweets_data = []
# Context manager guarantees the handle is closed even on error; tweets
# contain non-ASCII text, so decode explicitly as UTF-8.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or corrupt lines only; don't swallow other errors.
            continue

# Iterating the dict yields its keys; print each key/value pair with a
# blank line between entries for readability.
for k in tweets_data[0]:
    print(k, ":", tweets_data[0][k])
    print()

 输出:

 

4. 输出二级键值,与2,3类似

代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import json
import os

# Read tweets (line-delimited JSON) from the first file in a folder and
# print the second-level keys/values under the "user" object of the
# first tweet.
folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store parsed tweets (dicts) in this list
tweets_data = []
# Context manager guarantees the handle is closed even on error; tweets
# contain non-ASCII text, so decode explicitly as UTF-8.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            # Skip blank or corrupt lines only; don't swallow other errors.
            continue

# Iterate the nested "user" dict and print every key/value pair.
for k2 in tweets_data[0]["user"]:
    print(k2, ":", tweets_data[0]["user"][k2])

输出:

 

5. Tweets to csv and reading csv

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import os
import codecs  # NOTE(review): unused in this snippet; kept because only part of the file may be visible

# Flatten raw tweets (line-delimited JSON, one or more files directly under
# `folderpath`) into a single hand-built CSV with selected fields, then read
# the CSV back with pandas as a sanity check.

folderpath = r"D:\Twitter Data\Data"
files = os.listdir(folderpath)
os.chdir(folderpath)


def _quoted(value, strip_cr=False):
    """Return str(value) wrapped in double quotes, with newlines flattened
    to spaces and quote characters removed so the hand-built CSV stays
    well-formed. `strip_cr=True` also flattens carriage returns (used for
    the tweet text field, mirroring the original script)."""
    s = str(value)
    if strip_cr:
        s = s.replace("\r", " ")
    s = s.replace("\n", " ")
    s = s.replace("\"", "")
    s = s.replace("\'", "")
    return "\"" + s + "\""


# Open the output as UTF-8 explicitly (tweets contain non-ASCII text; the
# read-back below also assumes UTF-8) and write a BOM so spreadsheet tools
# detect the encoding. The context manager closes/flushes the file even if
# the loop raises.
with open(r"D:\Twitter Data\Data\test\tweets.csv", "w", encoding="utf-8") as fo:
    fo.write("\ufeff")
    fo.write("id,created_at,coordinates,co_lon,co_lat,geo,geo_lat,geo_lon," +
             "user_location,place_type,place_name," +
             "place_full_name,place_country,place_bounding_box,pb_avg_lon,pb_avg_lat," +
             "lang,source,text")
    count = 0

    for file in files:
        # skip sub-directories; only plain files hold tweet dumps
        if os.path.isdir(file):
            continue

        count += 1
        print(count, ":", file)

        # Context manager avoids leaking one handle per input file.
        with open(file, "r", encoding="utf-8") as tweets_file:
            for line in tweets_file:
                try:
                    tweet = json.loads(line)
                    fields = [tweet["id_str"], str(tweet["created_at"])]
                    # coordinates: geoJSON point, longitude first
                    if tweet["coordinates"]:
                        fields += ["Yes",
                                   str(tweet["coordinates"]["coordinates"][0]),
                                   str(tweet["coordinates"]["coordinates"][1])]
                    else:
                        fields += ["None", "None", "None"]
                    # geo: written as-is in the tweet's stated order
                    if tweet["geo"]:
                        fields += ["Yes",
                                   str(tweet["geo"]["coordinates"][0]),
                                   str(tweet["geo"]["coordinates"][1])]
                    else:
                        fields += ["None", "None", "None"]
                    # user-declared free-text location (sanitised)
                    fields.append(_quoted(tweet["user"]["location"]))
                    # place metadata
                    fields.append(str(tweet["place"]["place_type"]))
                    fields.append("\"" + str(tweet["place"]["name"]) + "\"")
                    fields.append("\"" + str(tweet["place"]["full_name"]) + "\"")
                    fields.append("\"" + str(tweet["place"]["country"]) + "\"")
                    # bounding box: average two opposite corners for a
                    # representative centre point
                    if tweet["place"]["bounding_box"]["coordinates"]:
                        box = tweet["place"]["bounding_box"]["coordinates"][0]
                        min_lon, min_lat = box[0][0], box[0][1]
                        max_lon, max_lat = box[2][0], box[2][1]
                        fields += ["Yes",
                                   str((min_lon + max_lon) / 2),
                                   str((min_lat + max_lat) / 2)]
                    else:
                        # original wrote these with spaces; preserved
                        fields.append("None, None, None")
                    fields.append(str(tweet["lang"]))
                    fields.append("\"" + str(tweet["source"]) + "\"")
                    fields.append(_quoted(tweet["text"], strip_cr=True))
                    # each row starts on a new line, matching the original
                    fo.write("\n" + ",".join(fields))
                except (json.JSONDecodeError, KeyError, TypeError, IndexError):
                    # Best-effort, like the original bare except: skip
                    # malformed lines or tweets missing expected fields
                    # (e.g. "place" is None) instead of aborting the run —
                    # but no longer swallow unrelated errors.
                    continue

import pandas as pd
# re-read the generated CSV, ignoring any bytes that fail to decode
df = pd.read_csv(open(r"D:\Twitter Data\Data\test\tweets.csv",
                      encoding='utf-8', errors='ignore'))
df.head()

  

 

 

 

posted on   McDelfino  阅读(256)  评论(0编辑  收藏  举报

编辑推荐:
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
阅读排行:
· DeepSeek 开源周回顾「GitHub 热点速览」
· 记一次.NET内存居高不下排查解决与启示
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· .NET10 - 预览版1新功能体验(一)
点击右上角即可分享
微信分享提示