【439】Processing Tweets with Python
Field description:
- coordinates: Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude). See the sketch below for how to read it.
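Because `coordinates` is GeoJSON (longitude first) while the legacy `geo` field stores latitude first, and both can be null, here is a minimal sketch of pulling a (lon, lat) pair out of a parsed tweet dict. The helper name `extract_point` and the `tweet` variable are illustrative, not part of the original script.

def extract_point(tweet):
    """Return (lon, lat) for a parsed tweet dict, or None if it is not geotagged."""
    coords = tweet.get("coordinates")      # GeoJSON point: [longitude, latitude]
    if coords:
        lon, lat = coords["coordinates"]
        return lon, lat
    geo = tweet.get("geo")                 # deprecated field: [latitude, longitude]
    if geo:
        lat, lon = geo["coordinates"]
        return lon, lat
    return None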
1. Convert the text file to JSON format
Read the tweets from a txt file (one JSON object per line), parse them into JSON format, then either pretty-print them or extract detailed information.
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print json format file with indentation
print(json.dumps(tweets_data[0], indent=4))
Output:
{
    "created_at": "Tue Jun 25 20:44:34 +0000 2019",
    "id": 1143621025550049280,
    "id_str": "1143621025550049280",
    "text": "Australia beat the Poms overnight \ud83d\ude01\ud83c\udfcf\ud83c\udde6\ud83c\uddfa\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f #AUSvENG #CmonAussie #CWC19",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 252426781,
        "id_str": "252426781",
        "name": "Willy Aitch",
        "screen_name": "WillyAitch",
        "location": "Melbourne, Victoria",
        "url": null,
        "description": "September 2017 to February 2018, was the greatest 5 months ever. Richmond \ud83d\udc2f\ud83d\udc2f\ud83d\udc2fwon the 2017 AFL Premiership! Philadelphia Eagles \ud83e\udd85\ud83e\udd85\ud83e\udd85 won Super Bowl LII",
        "translator_type": "none",
        "protected": false,
        "verified": false,
        "followers_count": 417,
        "friends_count": 1061,
        "listed_count": 15,
        "favourites_count": 18852,
        "statuses_count": 17796,
        "created_at": "Tue Feb 15 04:55:59 +0000 2011",
        "utc_offset": null,
        "time_zone": null,
        "geo_enabled": true,
        "lang": null,
        "contributors_enabled": false,
        "is_translator": false,
        "profile_background_color": "C0DEED",
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_tile": false,
        "profile_link_color": "1DA1F2",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "profile_image_url": "http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_banner_url": "https://pbs.twimg.com/profile_banners/252426781/1522377977",
        "default_profile": true,
        "default_profile_image": false,
        "following": null,
        "follow_request_sent": null,
        "notifications": null
    },
    "geo": null,
    "coordinates": null,
    "place": {
        "id": "01864a8a64df9dc4",
        "url": "https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json",
        "place_type": "city",
        "name": "Melbourne",
        "full_name": "Melbourne, Victoria",
        "country_code": "AU",
        "country": "Australia",
        "bounding_box": {
            "type": "Polygon",
            "coordinates": [
                [
                    [144.593742, -38.433859],
                    [144.593742, -37.511274],
                    [145.512529, -37.511274],
                    [145.512529, -38.433859]
                ]
            ]
        },
        "attributes": {}
    },
    "contributors": null,
    "is_quote_status": false,
    "quote_count": 0,
    "reply_count": 0,
    "retweet_count": 0,
    "favorite_count": 0,
    "entities": {
        "hashtags": [
            {"text": "AUSvENG", "indices": [46, 54]},
            {"text": "CmonAussie", "indices": [55, 66]},
            {"text": "CWC19", "indices": [67, 73]}
        ],
        "urls": [],
        "user_mentions": [],
        "symbols": []
    },
    "favorited": false,
    "retweeted": false,
    "filter_level": "low",
    "lang": "en",
    "timestamp_ms": "1561495474599"
}
2. Read the keys
Use .keys() to get all the top-level keys.
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print all top-level keys of the first tweet
for k in tweets_data[0].keys():
    print(k)
Output:
created_at
id
id_str
text
source
truncated
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
in_reply_to_screen_name
user
geo
coordinates
place
contributors
is_quote_status
quote_count
reply_count
retweet_count
favorite_count
entities
favorited
retweeted
filter_level
lang
timestamp_ms
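Not every tweet carries exactly the same keys (retweets and extended tweets add extra ones), so when exploring a new data set it can help to look at the union of keys over all loaded tweets rather than just the first one. A small sketch reusing `tweets_data` from the code above:

all_keys = set()
for t in tweets_data:
    all_keys.update(t.keys())   # merge the key names of every tweet
print(sorted(all_keys))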
3. Print key-value pairs
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print each top-level key together with its value
for k in tweets_data[0].keys():
    print(k, ":", tweets_data[0][k])
    print()
Output:
created_at : Tue Jun 25 20:44:34 +0000 2019
id : 1143621025550049280
id_str : 1143621025550049280
text : Australia beat the Poms overnight 😁🏏🇦🇺🏴 #AUSvENG #CmonAussie #CWC19
source : <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
truncated : False
in_reply_to_status_id : None
in_reply_to_status_id_str : None
in_reply_to_user_id : None
in_reply_to_user_id_str : None
in_reply_to_screen_name : None
user : {'id': 252426781, 'id_str': '252426781', 'name': 'Willy Aitch', 'screen_name': 'WillyAitch', 'location': 'Melbourne, Victoria', 'url': None, 'description': 'September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 417, 'friends_count': 1061, 'listed_count': 15, 'favourites_count': 18852, 'statuses_count': 17796, 'created_at': 'Tue Feb 15 04:55:59 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/252426781/1522377977', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}
geo : None
coordinates : None
place : {'id': '01864a8a64df9dc4', 'url': 'https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json', 'place_type': 'city', 'name': 'Melbourne', 'full_name': 'Melbourne, Victoria', 'country_code': 'AU', 'country': 'Australia', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[144.593742, -38.433859], [144.593742, -37.511274], [145.512529, -37.511274], [145.512529, -38.433859]]]}, 'attributes': {}}
contributors : None
is_quote_status : False
quote_count : 0
reply_count : 0
retweet_count : 0
favorite_count : 0
entities : {'hashtags': [{'text': 'AUSvENG', 'indices': [46, 54]}, {'text': 'CmonAussie', 'indices': [55, 66]}, {'text': 'CWC19', 'indices': [67, 73]}], 'urls': [], 'user_mentions': [], 'symbols': []}
favorited : False
retweeted : False
filter_level : low
lang : en
timestamp_ms : 1561495474599
4. Print second-level keys and values (similar to steps 2 and 3)
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print each second-level key of the nested "user" object
for k2 in tweets_data[0]["user"]:
    print(k2, ":", tweets_data[0]["user"][k2])
Output:
id : 252426781
id_str : 252426781
name : Willy Aitch
screen_name : WillyAitch
location : Melbourne, Victoria
url : None
description : September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII
translator_type : none
protected : False
verified : False
followers_count : 417
friends_count : 1061
listed_count : 15
favourites_count : 18852
statuses_count : 17796
created_at : Tue Feb 15 04:55:59 +0000 2011
utc_offset : None
time_zone : None
geo_enabled : True
lang : None
contributors_enabled : False
is_translator : False
profile_background_color : C0DEED
profile_background_image_url : http://abs.twimg.com/images/themes/theme1/bg.png
profile_background_image_url_https : https://abs.twimg.com/images/themes/theme1/bg.png
profile_background_tile : False
profile_link_color : 1DA1F2
profile_sidebar_border_color : C0DEED
profile_sidebar_fill_color : DDEEF6
profile_text_color : 333333
profile_use_background_image : True
profile_image_url : http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_image_url_https : https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_banner_url : https://pbs.twimg.com/profile_banners/252426781/1522377977
default_profile : True
default_profile_image : False
following : None
follow_request_sent : None
notifications : None
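Second-level access raises a TypeError when the parent value is null (for example, "place" and "coordinates" above are often None), so a small guarded-lookup helper can be handy before indexing two levels deep. This helper is a sketch and not part of the original script:

def nested_get(d, *keys, default=None):
    """Walk nested dicts, returning default as soon as a level is missing or None."""
    for key in keys:
        if not isinstance(d, dict) or d.get(key) is None:
            return default
        d = d[key]
    return d

# examples against the first loaded tweet
print(nested_get(tweets_data[0], "user", "location"))
print(nested_get(tweets_data[0], "place", "bounding_box", "type"))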
5. Tweets to CSV and reading the CSV
Code:
import json
import os
import codecs

folderpath = r"D:\Twitter Data\Data"
files = os.listdir(folderpath)
os.chdir(folderpath)

fo = open(r"D:\Twitter Data\Data\test\tweets.csv", "w")
fo.write("\ufeff")
fo.write("id,created_at,coordinates,co_lon,co_lat,geo,geo_lat,geo_lon," +
         "user_location,place_type,place_name," +
         "place_full_name,place_country,place_bounding_box,pb_avg_lon,pb_avg_lat," +
         "lang,source,text")

count = 0
for file in files:
    # determine is file or directory
    if os.path.isdir(file):
        continue
    count += 1
    print(count, ":", file)
    #if count < 100:
    #    continue
    tweets_file = open(file, "r")
    for line in tweets_file:
        try:
            #count += 1
            #if (count < 53850):
            #    continue
            tweet = json.loads(line)
            csv_text = "\n"

            # id
            csv_text += tweet["id_str"]
            csv_text += ","

            # created_at
            csv_text += str(tweet["created_at"])
            csv_text += ","

            # coordinates
            if (tweet["coordinates"]):
                csv_text += "Yes,"
                csv_text += str(tweet["coordinates"]["coordinates"][0])
                csv_text += ","
                csv_text += str(tweet["coordinates"]["coordinates"][1])
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # geo
            if (tweet["geo"]):
                csv_text += "Yes,"
                csv_text += str(tweet["geo"]["coordinates"][0])
                csv_text += ","
                csv_text += str(tweet["geo"]["coordinates"][1])
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # user->location
            ul = str(tweet["user"]["location"])
            ul = ul.replace("\n", " ")
            ul = ul.replace("\"", "")
            ul = ul.replace("\'", "")
            csv_text += "\"" + ul + "\""
            csv_text += ","

            # place->type
            csv_text += str(tweet["place"]["place_type"])
            csv_text += ","

            # place->name
            csv_text += "\"" + str(tweet["place"]["name"]) + "\""
            csv_text += ","

            # place->full_name
            csv_text += "\"" + str(tweet["place"]["full_name"]) + "\""
            csv_text += ","

            # place->country
            csv_text += "\"" + str(tweet["place"]["country"]) + "\""
            csv_text += ","

            # place->bounding_box
            if (tweet["place"]["bounding_box"]["coordinates"]):
                # min_lon
                min_lon = tweet["place"]["bounding_box"]["coordinates"][0][0][0]
                # min_lat
                min_lat = tweet["place"]["bounding_box"]["coordinates"][0][0][1]
                # max_lon
                max_lon = tweet["place"]["bounding_box"]["coordinates"][0][2][0]
                # max_lat
                max_lat = tweet["place"]["bounding_box"]["coordinates"][0][2][1]
                # avg of lon and lat
                lon = (min_lon + max_lon) / 2
                lat = (min_lat + max_lat) / 2
                csv_text += "Yes,"
                csv_text += str(lon)
                csv_text += ","
                csv_text += str(lat)
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # lang
            csv_text += str(tweet["lang"])
            csv_text += ","

            # source
            csv_text += "\"" + str(tweet["source"]) + "\""
            csv_text += ","

            # text
            # replace carriage return, double quotation marks, single quotation marks with space or nothing
            text = str(tweet["text"])
            text = text.replace("\r", " ")
            text = text.replace("\n", " ")
            text = text.replace("\"", "")
            text = text.replace("\'", "")
            csv_text += "\"" + text + "\""

            fo.write(csv_text)
            #if (count > 53851):
            #    break
        except:
            continue
    #if count > 150:
    #    break

fo.close()

import pandas as pd
df = pd.read_csv(open(r"D:\Twitter Data\Data\test\tweets.csv", encoding='utf-8', errors='ignore'))
df.head()
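Building the CSV row by string concatenation means every embedded comma, quote, and newline has to be escaped by hand. Python's standard csv module handles that quoting automatically. Below is a condensed sketch of the same idea with a reduced column set; the file names (tweets.txt, tweets_sample.csv) and the chosen columns are illustrative, not the original script's.

import csv
import json

columns = ["id", "created_at", "lang", "user_location", "place_full_name", "lon", "lat", "text"]

with open("tweets_sample.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)            # handles quoting/escaping of commas and quotes
    writer.writerow(columns)
    with open("tweets.txt", "r", encoding="utf-8") as src:
        for line in src:
            try:
                tweet = json.loads(line)
            except ValueError:
                continue                # skip lines that are not valid JSON
            coords = tweet.get("coordinates")
            lon, lat = (coords["coordinates"] if coords else (None, None))
            place = tweet.get("place") or {}
            writer.writerow([
                tweet["id_str"],
                tweet["created_at"],
                tweet["lang"],
                tweet["user"].get("location"),
                place.get("full_name"),
                lon,
                lat,
                tweet["text"].replace("\n", " "),
            ])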