【439】Processing Tweets with Python
Field description:
- coordinates: Represents the geographic location of this Tweet as reported by the user or client application. The inner coordinates array is formatted as geoJSON (longitude first, then latitude). See the sketch below for how to read it.
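Because `coordinates` is GeoJSON (longitude first) while the legacy `geo` field stores latitude first, and both can be null, here is a minimal sketch of pulling a (lon, lat) pair out of a parsed tweet dict. The helper name `extract_point` and the `tweet` variable are illustrative, not part of the original script.

def extract_point(tweet):
    """Return (lon, lat) for a parsed tweet dict, or None if it is not geotagged."""
    coords = tweet.get("coordinates")      # GeoJSON point: [longitude, latitude]
    if coords:
        lon, lat = coords["coordinates"]
        return lon, lat
    geo = tweet.get("geo")                 # deprecated field: [latitude, longitude]
    if geo:
        lat, lon = geo["coordinates"]
        return lon, lat
    return None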
1. Convert the text file to JSON format
Read the tweets from a txt file (one JSON object per line), parse them into JSON format, then either pretty-print them or extract detailed information.
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print json format file with indentation
print(json.dumps(tweets_data[0], indent=4))
Output:
{
    "created_at": "Tue Jun 25 20:44:34 +0000 2019",
    "id": 1143621025550049280,
    "id_str": "1143621025550049280",
    "text": "Australia beat the Poms overnight \ud83d\ude01\ud83c\udfcf\ud83c\udde6\ud83c\uddfa\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f #AUSvENG #CmonAussie #CWC19",
    "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 252426781,
        "id_str": "252426781",
        "name": "Willy Aitch",
        "screen_name": "WillyAitch",
        "location": "Melbourne, Victoria",
        "url": null,
        "description": "September 2017 to February 2018, was the greatest 5 months ever. Richmond \ud83d\udc2f\ud83d\udc2f\ud83d\udc2fwon the 2017 AFL Premiership! Philadelphia Eagles \ud83e\udd85\ud83e\udd85\ud83e\udd85 won Super Bowl LII",
        "translator_type": "none",
        "protected": false,
        "verified": false,
        "followers_count": 417,
        "friends_count": 1061,
        "listed_count": 15,
        "favourites_count": 18852,
        "statuses_count": 17796,
        "created_at": "Tue Feb 15 04:55:59 +0000 2011",
        "utc_offset": null,
        "time_zone": null,
        "geo_enabled": true,
        "lang": null,
        "contributors_enabled": false,
        "is_translator": false,
        "profile_background_color": "C0DEED",
        "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
        "profile_background_tile": false,
        "profile_link_color": "1DA1F2",
        "profile_sidebar_border_color": "C0DEED",
        "profile_sidebar_fill_color": "DDEEF6",
        "profile_text_color": "333333",
        "profile_use_background_image": true,
        "profile_image_url": "http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_image_url_https": "https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg",
        "profile_banner_url": "https://pbs.twimg.com/profile_banners/252426781/1522377977",
        "default_profile": true,
        "default_profile_image": false,
        "following": null,
        "follow_request_sent": null,
        "notifications": null
    },
    "geo": null,
    "coordinates": null,
    "place": {
        "id": "01864a8a64df9dc4",
        "url": "https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json",
        "place_type": "city",
        "name": "Melbourne",
        "full_name": "Melbourne, Victoria",
        "country_code": "AU",
        "country": "Australia",
        "bounding_box": {
            "type": "Polygon",
            "coordinates": [
                [
                    [144.593742, -38.433859],
                    [144.593742, -37.511274],
                    [145.512529, -37.511274],
                    [145.512529, -38.433859]
                ]
            ]
        },
        "attributes": {}
    },
    "contributors": null,
    "is_quote_status": false,
    "quote_count": 0,
    "reply_count": 0,
    "retweet_count": 0,
    "favorite_count": 0,
    "entities": {
        "hashtags": [
            {"text": "AUSvENG", "indices": [46, 54]},
            {"text": "CmonAussie", "indices": [55, 66]},
            {"text": "CWC19", "indices": [67, 73]}
        ],
        "urls": [],
        "user_mentions": [],
        "symbols": []
    },
    "favorited": false,
    "retweeted": false,
    "filter_level": "low",
    "lang": "en",
    "timestamp_ms": "1561495474599"
}
2. Read the keys
Use .keys() to get all the top-level keys.
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print all top-level keys of the first tweet
for k in tweets_data[0].keys():
    print(k)
Output:
created_at
id
id_str
text
source
truncated
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
in_reply_to_screen_name
user
geo
coordinates
place
contributors
is_quote_status
quote_count
reply_count
retweet_count
favorite_count
entities
favorited
retweeted
filter_level
lang
timestamp_ms
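Not every tweet carries exactly the same keys (retweets and extended tweets add extra ones), so when exploring a new data set it can help to look at the union of keys over all loaded tweets rather than just the first one. A small sketch reusing `tweets_data` from the code above:

all_keys = set()
for t in tweets_data:
    all_keys.update(t.keys())   # merge the key names of every tweet
print(sorted(all_keys))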
3. Print key-value pairs
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print each top-level key together with its value
for k in tweets_data[0].keys():
    print(k, ":", tweets_data[0][k])
    print()
Output:
created_at : Tue Jun 25 20:44:34 +0000 2019
id : 1143621025550049280
id_str : 1143621025550049280
text : Australia beat the Poms overnight 😁🏏🇦🇺🏴 #AUSvENG #CmonAussie #CWC19
source : <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
truncated : False
in_reply_to_status_id : None
in_reply_to_status_id_str : None
in_reply_to_user_id : None
in_reply_to_user_id_str : None
in_reply_to_screen_name : None
user : {'id': 252426781, 'id_str': '252426781', 'name': 'Willy Aitch', 'screen_name': 'WillyAitch', 'location': 'Melbourne, Victoria', 'url': None, 'description': 'September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 417, 'friends_count': 1061, 'listed_count': 15, 'favourites_count': 18852, 'statuses_count': 17796, 'created_at': 'Tue Feb 15 04:55:59 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/252426781/1522377977', 'default_profile': True, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}
geo : None
coordinates : None
place : {'id': '01864a8a64df9dc4', 'url': 'https://api.twitter.com/1.1/geo/id/01864a8a64df9dc4.json', 'place_type': 'city', 'name': 'Melbourne', 'full_name': 'Melbourne, Victoria', 'country_code': 'AU', 'country': 'Australia', 'bounding_box': {'type': 'Polygon', 'coordinates': [[[144.593742, -38.433859], [144.593742, -37.511274], [145.512529, -37.511274], [145.512529, -38.433859]]]}, 'attributes': {}}
contributors : None
is_quote_status : False
quote_count : 0
reply_count : 0
retweet_count : 0
favorite_count : 0
entities : {'hashtags': [{'text': 'AUSvENG', 'indices': [46, 54]}, {'text': 'CmonAussie', 'indices': [55, 66]}, {'text': 'CWC19', 'indices': [67, 73]}], 'urls': [], 'user_mentions': [], 'symbols': []}
favorited : False
retweeted : False
filter_level : low
lang : en
timestamp_ms : 1561495474599
4. Print second-level keys and values (similar to steps 2 and 3)
Code:
import json
import os

folderpath = r"D:\Twitter Data\Data\test"
files = os.listdir(folderpath)
os.chdir(folderpath)

# get the first txt file
tweets_data_path = files[0]

# store json format file in this array
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

# print each second-level key of the nested "user" object
for k2 in tweets_data[0]["user"]:
    print(k2, ":", tweets_data[0]["user"][k2])
Output:
id : 252426781
id_str : 252426781
name : Willy Aitch
screen_name : WillyAitch
location : Melbourne, Victoria
url : None
description : September 2017 to February 2018, was the greatest 5 months ever. Richmond 🐯🐯🐯won the 2017 AFL Premiership! Philadelphia Eagles 🦅🦅🦅 won Super Bowl LII
translator_type : none
protected : False
verified : False
followers_count : 417
friends_count : 1061
listed_count : 15
favourites_count : 18852
statuses_count : 17796
created_at : Tue Feb 15 04:55:59 +0000 2011
utc_offset : None
time_zone : None
geo_enabled : True
lang : None
contributors_enabled : False
is_translator : False
profile_background_color : C0DEED
profile_background_image_url : http://abs.twimg.com/images/themes/theme1/bg.png
profile_background_image_url_https : https://abs.twimg.com/images/themes/theme1/bg.png
profile_background_tile : False
profile_link_color : 1DA1F2
profile_sidebar_border_color : C0DEED
profile_sidebar_fill_color : DDEEF6
profile_text_color : 333333
profile_use_background_image : True
profile_image_url : http://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_image_url_https : https://pbs.twimg.com/profile_images/1112669591342211072/rnbV0dCK_normal.jpg
profile_banner_url : https://pbs.twimg.com/profile_banners/252426781/1522377977
default_profile : True
default_profile_image : False
following : None
follow_request_sent : None
notifications : None
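Second-level access raises a TypeError when the parent value is null (for example, "place" and "coordinates" above are often None), so a small guarded-lookup helper can be handy before indexing two levels deep. This helper is a sketch and not part of the original script:

def nested_get(d, *keys, default=None):
    """Walk nested dicts, returning default as soon as a level is missing or None."""
    for key in keys:
        if not isinstance(d, dict) or d.get(key) is None:
            return default
        d = d[key]
    return d

# examples against the first loaded tweet
print(nested_get(tweets_data[0], "user", "location"))
print(nested_get(tweets_data[0], "place", "bounding_box", "type"))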
5. Tweets to CSV and reading the CSV
Code:
import json
import os
import codecs

folderpath = r"D:\Twitter Data\Data"
files = os.listdir(folderpath)
os.chdir(folderpath)

fo = open(r"D:\Twitter Data\Data\test\tweets.csv", "w")
fo.write("\ufeff")
fo.write("id,created_at,coordinates,co_lon,co_lat,geo,geo_lat,geo_lon," +
         "user_location,place_type,place_name," +
         "place_full_name,place_country,place_bounding_box,pb_avg_lon,pb_avg_lat," +
         "lang,source,text")

count = 0
for file in files:
    # determine is file or directory
    if os.path.isdir(file):
        continue
    count += 1
    print(count, ":", file)
    #if count < 100:
    #    continue
    tweets_file = open(file, "r")
    for line in tweets_file:
        try:
            #count += 1
            #if (count < 53850):
            #    continue
            tweet = json.loads(line)
            csv_text = "\n"

            # id
            csv_text += tweet["id_str"]
            csv_text += ","

            # created_at
            csv_text += str(tweet["created_at"])
            csv_text += ","

            # coordinates
            if (tweet["coordinates"]):
                csv_text += "Yes,"
                csv_text += str(tweet["coordinates"]["coordinates"][0])
                csv_text += ","
                csv_text += str(tweet["coordinates"]["coordinates"][1])
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # geo
            if (tweet["geo"]):
                csv_text += "Yes,"
                csv_text += str(tweet["geo"]["coordinates"][0])
                csv_text += ","
                csv_text += str(tweet["geo"]["coordinates"][1])
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # user->location
            ul = str(tweet["user"]["location"])
            ul = ul.replace("\n", " ")
            ul = ul.replace("\"", "")
            ul = ul.replace("\'", "")
            csv_text += "\"" + ul + "\""
            csv_text += ","

            # place->type
            csv_text += str(tweet["place"]["place_type"])
            csv_text += ","

            # place->name
            csv_text += "\"" + str(tweet["place"]["name"]) + "\""
            csv_text += ","

            # place->full_name
            csv_text += "\"" + str(tweet["place"]["full_name"]) + "\""
            csv_text += ","

            # place->country
            csv_text += "\"" + str(tweet["place"]["country"]) + "\""
            csv_text += ","

            # place->bounding_box
            if (tweet["place"]["bounding_box"]["coordinates"]):
                # min_lon
                min_lon = tweet["place"]["bounding_box"]["coordinates"][0][0][0]
                # min_lat
                min_lat = tweet["place"]["bounding_box"]["coordinates"][0][0][1]
                # max_lon
                max_lon = tweet["place"]["bounding_box"]["coordinates"][0][2][0]
                # max_lat
                max_lat = tweet["place"]["bounding_box"]["coordinates"][0][2][1]
                # avg of lon and lat
                lon = (min_lon + max_lon) / 2
                lat = (min_lat + max_lat) / 2
                csv_text += "Yes,"
                csv_text += str(lon)
                csv_text += ","
                csv_text += str(lat)
            else:
                csv_text += "None,None,None"
            csv_text += ","

            # lang
            csv_text += str(tweet["lang"])
            csv_text += ","

            # source
            csv_text += "\"" + str(tweet["source"]) + "\""
            csv_text += ","

            # text
            # replace carriage return, double quotation marks, single quotation marks with space or nothing
            text = str(tweet["text"])
            text = text.replace("\r", " ")
            text = text.replace("\n", " ")
            text = text.replace("\"", "")
            text = text.replace("\'", "")
            csv_text += "\"" + text + "\""

            fo.write(csv_text)
            #if (count > 53851):
            #    break
        except:
            continue
    #if count > 150:
    #    break

fo.close()

import pandas as pd
df = pd.read_csv(open(r"D:\Twitter Data\Data\test\tweets.csv", encoding='utf-8', errors='ignore'))
df.head()
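Building the CSV row by string concatenation means every embedded comma, quote, and newline has to be escaped by hand. Python's standard csv module handles that quoting automatically. Below is a condensed sketch of the same idea with a reduced column set; the file names (tweets.txt, tweets_sample.csv) and the chosen columns are illustrative, not the original script's.

import csv
import json

columns = ["id", "created_at", "lang", "user_location", "place_full_name", "lon", "lat", "text"]

with open("tweets_sample.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.writer(out)            # handles quoting/escaping of commas and quotes
    writer.writerow(columns)
    with open("tweets.txt", "r", encoding="utf-8") as src:
        for line in src:
            try:
                tweet = json.loads(line)
            except ValueError:
                continue                # skip lines that are not valid JSON
            coords = tweet.get("coordinates")
            lon, lat = (coords["coordinates"] if coords else (None, None))
            place = tweet.get("place") or {}
            writer.writerow([
                tweet["id_str"],
                tweet["created_at"],
                tweet["lang"],
                tweet["user"].get("location"),
                place.get("full_name"),
                lon,
                lat,
                tweet["text"].replace("\n", " "),
            ])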