Parse this custom capture with Python and create statistics

I am trying to collect Twitter statistics from a specific dataset that was provided to me. I can't control how the data is formatted before it is handed to me, so I am more or less stuck with it as-is.

I would like some suggestions on how I can write a Python program to parse this input and output something more along the lines of a CSV file, with the field names as column headings and the values below them.

I want to use Python because in the end I would like to feed the results into some statistical tools I have already put together.

Also, CSV output is preferred because I can import it into something like SPSS for statistical analysis.

Here is an example of what one post from the dataset looks like:

{"text":"A gente todos os dias arruma os cabelos: por que não o coração?","contributors":null,"geo":null,"retweeted":false,"in_reply_to_screen_name":null,"truncated":false,"entities":{"urls":[],"hashtags":[],"user_mentions":[]},"in_reply_to_status_id_str":null,"id":50270714498002945,"source":"web","in_reply_to_user_id_str":null,"favorited":false,"in_reply_to_status_id":null,"created_at":"Tue Mar 22 19:00:46 +0000 2011","in_reply_to_user_id":null,"retweet_count":0,"id_str":"50270714498002945","place":null,"user":{"location":"Brasil, Recife-PE","statuses_count":16,"profile_background_tile":true,"lang":"en","profile_link_color":"867c5f","id":59154474,"following":null,"favourites_count":0,"protected":false,"profile_text_color":"91957f","verified":false,"contributors_enabled":false,"description":"","profile_sidebar_border_color":"eae2bc","name":"Natalia Aráujo","profile_background_color":"eae2bc","created_at":"Wed Jul 22 15:27:15 +0000 2009","followers_count":10,"geo_enabled":false,"profile_background_image_url":"http://a3.twimg.com/profile_background_images/220796682/music-2.png","follow_request_sent":null,"url":null,"utc_offset":-10800,"time_zone":"Brasilia","notifications":null,"profile_use_background_image":true,"friends_count":18,"profile_sidebar_fill_color":"eae2bc","screen_name":"nat_araujo","id_str":"59154474","show_all_inline_media":false,"profile_image_url":"http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG","listed_count":1,"is_translator":false},"coordinates":null} 

The dataset is one continuous line; there are no newlines between messages. The only separators between the actual messages are the markers below.

All messages begin with

 {"text": 

and end with

 null} 

Any suggestions will be appreciated, and of course I will be happy to share my results with everyone.


Edit

Based on everyone's suggestions, I started with the following:

 import sys
 import json
 from pprint import pprint

 if len(sys.argv) != 2:
     print 'Usage: twitterjson2cvs.py (path/filename)'
     sys.exit()

 inputfile = open(sys.argv[1])

 jsondatain = json.load(inputfile)
 pprint(jsondatain)

 inputfile.close()

which outputs something a little cleaner in the form:

 {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Tue Mar 22 19:00:46 +0000 2011',
  u'entities': {u'hashtags': [], u'urls': [], u'user_mentions': []},
  u'favorited': False,
  u'geo': None,
  u'id': 50270714498002945L,
  u'id_str': u'50270714498002945',
  u'in_reply_to_screen_name': None,
  u'in_reply_to_status_id': None,
  u'in_reply_to_status_id_str': None,
  u'in_reply_to_user_id': None,
  u'in_reply_to_user_id_str': None,
  u'place': None,
  u'retweet_count': 0,
  u'retweeted': False,
  u'source': u'web',
  u'text': u'A gente todos os dias arruma os cabelos: por que n\xe3o o cora\xe7\xe3o?',
  u'truncated': False,
  u'user': {u'contributors_enabled': False,
            u'created_at': u'Wed Jul 22 15:27:15 +0000 2009',
            u'description': u'',
            u'favourites_count': 0,
            u'follow_request_sent': None,
            u'followers_count': 10,
            u'following': None,
            u'friends_count': 18,
            u'geo_enabled': False,
            u'id': 59154474,
            u'id_str': u'59154474',
            u'is_translator': False,
            u'lang': u'en',
            u'listed_count': 1,
            u'location': u'Brasil, Recife-PE',
            u'name': u'Natalia Ar\xe1ujo',
            u'notifications': None,
            u'profile_background_color': u'eae2bc',
            u'profile_background_image_url': u'http://a3.twimg.com/profile_background_images/220796682/music-2.png',
            u'profile_background_tile': True,
            u'profile_image_url': u'http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG',
            u'profile_link_color': u'867c5f',
            u'profile_sidebar_border_color': u'eae2bc',
            u'profile_sidebar_fill_color': u'eae2bc',
            u'profile_text_color': u'91957f',
            u'profile_use_background_image': True,
            u'protected': False,
            u'screen_name': u'nat_araujo',
            u'show_all_inline_media': False,
            u'statuses_count': 16,
            u'time_zone': u'Brasilia',
            u'url': None,
            u'utc_offset': -10800,
            u'verified': False}}

Edit

I added an attempt at CSV output to the previous code:

 import sys
 import json
 #from pprint import pprint
 import csv

 if len(sys.argv) != 2:
     print 'Usage: twitterjson2cvs.py (path/filename)'
     sys.exit()

 inputfile = open(sys.argv[1])

 jsondatain = json.load(inputfile)

 f = csv.writer(open("test.csv", "wb+"))

 f.writerow(["contributors", "coordinates", "created_at", "entities", "hashtags", "urls", "user_mentions", "favorited", "geo", "id", "id_str", "in_reply_to_screen_name", "in_reply_to_status_id", "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str", "place", "retweet_count", "retweeted", "source", "text", "truncated", "user", "contributors_enabled", "created_at", "description", "favorites_count", "follow_request_sent", "followers_count", "following", "friends_count", "geo_enabled", "id", "id_str", "is_translator", "lang", "listed_count", "location", "name", "notifications", "profile_background_color", "profile_background_image_url", "profile_background_tile", "profile_image_url", "profile_link_color", "profile_sidebar_border_color", "profile_sidebar_fill_color", "profile_text_color", "profile_use_background_image", "protected", "screen_name", "show_all_inline_media", "statuses_count", "time_zone", "url", "utc_offset", "verified"])
 for x in jsondatain:
     f.writerow([x["contributors"], x["fields"]["coordinates"], x["fields"]["created_at"], x["fields"]["entities"], x["fields"]["hashtags"], x["fields"]["urls"], x["fields"]["user_mentions"], x["fields"]["favorited"], x["fields"]["geo"], x["fields"]["id"], x["fields"]["id_str"], x["fields"]["in_reply_to_screen_name"], x["fields"]["in_reply_to_status_id"], x["fields"]["in_reply_to_status_id_str"], x["fields"]["in_reply_to_user_id"], x["fields"]["in_reply_to_user_id_str"], x["fields"]["place"], x["fields"]["retweet_count"], x["fields"]["retweeted"], x["fields"]["source"], x["fields"]["text"], x["fields"]["truncated"], x["fields"]["user"], x["fields"]["contributors_enabled"], x["fields"]["created_at"], x["fields"]["description"], x["fields"]["favorites_count"], x["fields"]["follow_request_sent"], x["fields"]["followers_count"], x["fields"]["following"], x["fields"]["friends_count"], x["fields"]["geo_enabled"], x["fields"]["id"], x["fields"]["id_str"], x["fields"]["is_translator"], x["fields"]["lang"], x["fields"]["listed_count"], x["fields"]["location"], x["fields"]["name"], x["fields"]["notifications"], x["fields"]["profile_background_color"], x["fields"]["profile_background_image_url"], x["fields"]["profile_background_tile"], x["fields"]["profile_image_url"], x["fields"]["profile_link_color"], x["fields"]["profile_sidebar_border_color"], x["fields"]["profile_sidebar_fill_color"], x["fields"]["profile_text_color"], x["fields"]["profile_use_background_image"], x["fields"]["protected"], x["fields"]["screen_name"], x["fields"]["show_all_inline_media"], x["fields"]["statuses_count"], x["fields"]["time_zone"], x["fields"]["url"], x["fields"]["utc_offset"], x["fields"]["verified"]])

 #pprint(jsondatain)

 inputfile.close()

However, when I run it, I get:

  File "twitterjson2cvs.py", line 28, in 
     f.writerow ([x ["contributors"], x ["fields"] ["coordinates"], x ["fields"] ["created_at"], x ["fields"] ["entities"], x [ "fields"] ["hashtags"], x ["fields"] ["urls"], x ["fields"] ["user_mentions"], x ["fields"] ["favorited"], x ["fields "] [" geo "], x [" fields "] [" id "], x [" fields "] [" id_str "], x [" fields "] [" in_reply_to_screen_name "], x [" fields "] ["in_reply_to_status_id"], x ["fields"] ["in_reply_to_status_id_str"], x ["fields"] ["in_reply_to_user_id"], x ["fields"] ["in_reply_to_user_id_str"], x ["fields"] [ place "], x [" fields "] [" retweet_count "], x [" fields "] [" retweeted "], x [" fields "] [" source "], x [" fields "] [" text " ], x ["fields"] ["truncated"], x ["fields"] ["user"], x ["fields"] ["contributors_enabled"], x ["fields"] ["created_at"], x ["fields"] ["description"], x ["fields"] ["favorites_count"], x ["fields"] ["follow_request_sent"], x ["fields"] ["followers_count"], x [ "fields"] ["following"], x ["fields"] ["friends_count"], x ["fields"] ["geo_enabled"], x ["fields"] ["id"], x ["fields "] [" id_str "], x [" fields "] [" is_translator "], x [" fields "] [" lang "], x [" fields "] [" listed_count "  ], x ["fields"] ["location"], x ["fields"] ["name"], x ["fields"] ["notifications"], x ["fields"] ["profile_background_color"], x ["fields"] ["profile_background_image_url"], x ["fields"] ["profile_background_tile"], x ["fields"] ["profile_image_url"], x ["fields"] ["profile_link_color"], x [ "fields"] ["profile_sidebar_border_color"], x ["fields"] ["profile_sidebar_fill_color"], x ["fields"] ["profile_text_color"], x ["fields"] ["profile_use_background_image"], x ["fields "] [" protected "], x [" fields "] [" screen_name "], x [" fields "] [" show_all_inline_media "], x [" fields "] [" statuses_count "], x [" fields "] ["time_zone"], x ["fields"] ["url"], x ["fields"] ["utc_offset"], x ["fields"] ["verified"]])
 TypeError: string indices must be integers

The error has something to do with how the fields are being referenced, but I don't see it.


Edit

I updated the code to reflect the suggested format, as follows:

 import sys
 import json
 import csv

 if len(sys.argv) != 2:
     print 'Usage: twitterjson2cvs.py (path/filename)'
     sys.exit()

 inputfile = open(sys.argv[1])

 jsondatain = json.load(inputfile)

 f = csv.writer(open("test.csv", "wb+"))

 f.writerow(["contributors", "coordinates", "created_at", "entities", "hashtags", "urls", "user_mentions", "favorited", "geo", "id", "id_str", "in_reply_to_screen_name", "in_reply_to_status_id", "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str", "place", "retweet_count", "retweeted", "source", "text", "truncated", "user", "contributors_enabled", "created_at", "description", "favorites_count", "follow_request_sent", "followers_count", "following", "friends_count", "geo_enabled", "id", "id_str", "is_translator", "lang", "listed_count", "location", "name", "notifications", "profile_background_color", "profile_background_image_url", "profile_background_tile", "profile_image_url", "profile_link_color", "profile_sidebar_border_color", "profile_sidebar_fill_color", "profile_text_color", "profile_use_background_image", "protected", "screen_name", "show_all_inline_media", "statuses_count", "time_zone", "url", "utc_offset", "verified"])
 for x in jsondatain:
     f.writerow(
         (
             x['contributors'],
             x['coordinates'],
             x['created_at'],
             x['entities']['hashtags'],
             x['entities']['urls'],
             x['entities']['user_mentions'],
             x['favorited'],
             x['geo'],
             x['id'],
             x['id_str'],
             x['in_reply_to_screen_name'],
             x['in_reply_to_status_id'],
             x['in_reply_to_status_id_str'],
             x['in_reply_to_user_id'],
             x['in_reply_to_user_id_str'],
             x['place'],
             x['retweet_count'],
             x['retweeted'],
             x['source'],
             x['text'].encode('utf8'),
             x['truncated'],
             x['user']['contributors_enabled'],
             x['user']['created_at'],
             x['user']['description'],
             x['user']['favourites_count'],
             x['user']['follow_request_sent'],
             x['user']['followers_count'],
             x['user']['following'],
             x['user']['friends_count'],
             x['user']['geo_enabled'],
             x['user']['id'],
             x['user']['id_str'],
             x['user']['is_translator'],
             x['user']['lang'],
             x['user']['listed_count'],
             x['user']['location'],
             x['user']['name'].encode('utf8'),
             x['user']['notifications'],
             x['user']['profile_background_color'],
             x['user']['profile_background_image_url'],
             x['user']['profile_background_tile'],
             x['user']['profile_image_url'],
             x['user']['profile_link_color'],
             x['user']['profile_sidebar_border_color'],
             x['user']['profile_sidebar_fill_color'],
             x['user']['profile_text_color'],
             x['user']['profile_use_background_image'],
             x['user']['protected'],
             x['user']['screen_name'],
             x['user']['show_all_inline_media'],
             x['user']['statuses_count'],
             x['user']['time_zone'],
             x['user']['url'],
             x['user']['utc_offset'],
             x['user']['verified']
         )
     )


 inputfile.close()

I still get the following error:

 twitterjson2cvs.py TweetFile1300820340639.tcm.online
 Traceback (most recent call last):
   File "workspace/coalmine-datafilter/src/twitterjson2csv.py", line 30, in <module>
     x['contributors'],
 TypeError: string indices must be integers


Edit

Everything works fine up to this point for a single input file containing one JSON object. Feeding the previous JSON example into this program:

 import sys
 import json
 import csv

 if len(sys.argv) != 2:
     print 'Usage: twitterjson2cvs.py (path/filename)'
     sys.exit()

 inputfile = open(sys.argv[1])

 jsonindata = json.load(inputfile)

 f = csv.writer(open("test.csv", "wb+"))

 f.writerow(["contributors", "coordinates", "created_at", "entities", "hashtags", "urls", "user_mentions", "favorited", "geo", "id", "id_str", "in_reply_to_screen_name", "in_reply_to_status_id", "in_reply_to_status_id_str", "in_reply_to_user_id", "in_reply_to_user_id_str", "place", "retweet_count", "retweeted", "source", "text", "truncated", "user", "contributors_enabled", "created_at", "description", "favorites_count", "follow_request_sent", "followers_count", "following", "friends_count", "geo_enabled", "id", "id_str", "is_translator", "lang", "listed_count", "location", "name", "notifications", "profile_background_color", "profile_background_image_url", "profile_background_tile", "profile_image_url", "profile_link_color", "profile_sidebar_border_color", "profile_sidebar_fill_color", "profile_text_color", "profile_use_background_image", "protected", "screen_name", "show_all_inline_media", "statuses_count", "time_zone", "url", "utc_offset", "verified"])

 f.writerow(
         (
             jsonindata['contributors'],
             jsonindata['coordinates'],
             jsonindata['created_at'],
             jsonindata['entities']['hashtags'],
             jsonindata['entities']['urls'],
             jsonindata['entities']['user_mentions'],
             jsonindata['favorited'],
             jsonindata['geo'],
             jsonindata['id'],
             jsonindata['id_str'],
             jsonindata['in_reply_to_screen_name'],
             jsonindata['in_reply_to_status_id'],
             jsonindata['in_reply_to_status_id_str'],
             jsonindata['in_reply_to_user_id'],
             jsonindata['in_reply_to_user_id_str'],
             jsonindata['place'],
             jsonindata['retweet_count'],
             jsonindata['retweeted'],
             jsonindata['source'],
             jsonindata['text'].encode('utf8'),
             jsonindata['truncated'],
             jsonindata['user']['contributors_enabled'],
             jsonindata['user']['created_at'],
             jsonindata['user']['description'],
             jsonindata['user']['favourites_count'],
             jsonindata['user']['follow_request_sent'],
             jsonindata['user']['followers_count'],
             jsonindata['user']['following'],
             jsonindata['user']['friends_count'],
             jsonindata['user']['geo_enabled'],
             jsonindata['user']['id'],
             jsonindata['user']['id_str'],
             jsonindata['user']['is_translator'],
             jsonindata['user']['lang'],
             jsonindata['user']['listed_count'],
             jsonindata['user']['location'],
             jsonindata['user']['name'].encode('utf8'),
             jsonindata['user']['notifications'],
             jsonindata['user']['profile_background_color'],
             jsonindata['user']['profile_background_image_url'],
             jsonindata['user']['profile_background_tile'],
             jsonindata['user']['profile_image_url'],
             jsonindata['user']['profile_link_color'],
             jsonindata['user']['profile_sidebar_border_color'],
             jsonindata['user']['profile_sidebar_fill_color'],
             jsonindata['user']['profile_text_color'],
             jsonindata['user']['profile_use_background_image'],
             jsonindata['user']['protected'],
             jsonindata['user']['screen_name'],
             jsonindata['user']['show_all_inline_media'],
             jsonindata['user']['statuses_count'],
             jsonindata['user']['time_zone'],
             jsonindata['user']['url'],
             jsonindata['user']['utc_offset'],
             jsonindata['user']['verified']
         )
     )

 inputfile.close()

results in well-formatted output, ready for tools such as SPSS, as follows:

 contributors,coordinates,created_at,entities,hashtags,urls,user_mentions,favorited,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,place,retweet_count,retweeted,source,text,truncated,user,contributors_enabled,created_at,description,favorites_count,follow_request_sent,followers_count,following,friends_count,geo_enabled,id,id_str,is_translator,lang,listed_count,location,name,notifications,profile_background_color,profile_background_image_url,profile_background_tile,profile_image_url,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,protected,screen_name,show_all_inline_media,statuses_count,time_zone,url,utc_offset,verified
 ,,Tue Mar 22 19:00:46 +0000 2011,[],[],[],False,,50270714498002945,50270714498002945,,,,,,,0,False,web,A gente todos os dias arruma os cabelos: por que não o coração?,False,False,Wed Jul 22 15:27:15 +0000 2009,,0,,10,,18,False,59154474,59154474,False,en,1,"Brasil, Recife-PE",Natalia Aráujo,,eae2bc,http://a3.twimg.com/profile_background_images/220796682/music-2.png,True,http://a0.twimg.com/profile_images/1247378890/154254_normal.JPG,867c5f,eae2bc,eae2bc,91957f,True,False,nat_araujo,False,16,Brasilia,,-10800,False

Now the only problem is that I have input files with multiple JSON objects concatenated on one continuous line. When I try to run the same program on those files, I get the following error:

 Traceback (most recent call last):
   File "workspace/coalmine-datafilter/src/twitterjson2cvs.py", line 22, in <module>
     jsonindata = json.load(inputfile)
   File "/usr/lib/python2.6/json/__init__.py", line 267, in load
     parse_constant=parse_constant, **kw)
   File "/usr/lib/python2.6/json/__init__.py", line 307, in loads
     return _default_decoder.decode(s)
   File "/usr/lib/python2.6/json/decoder.py", line 322, in decode
     raise ValueError(errmsg("Extra data", s, end, len(s)))
 ValueError: Extra data: line 1 column 1514 - line 2 column 1 (char 1514 - 2427042)

The input file is quite large (several thousand tweets), and I don't know whether the error is caused by the sheer number of messages or by the fact that the file has several {"...."} {". ... "} objects all on one line. Any ideas? Perhaps I need to somehow insert a newline after each tweet?
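
One idea I am toying with, based on reading about json.JSONDecoder, is to stop calling json.load() on the whole file and instead walk the string with raw_decode(), which returns one decoded object at a time plus the offset where it stopped. This is only a rough sketch of what I mean and I have not verified it against the real capture files:

 import json

 def iter_tweets(blob):
     # Yield one decoded tweet dict at a time from a string of
     # back-to-back JSON objects with nothing between them.
     decoder = json.JSONDecoder()
     pos = 0
     while pos < len(blob):
         # skip any stray whitespace between objects, just in case
         while pos < len(blob) and blob[pos].isspace():
             pos += 1
         if pos >= len(blob):
             break
         obj, end = decoder.raw_decode(blob, pos)
         yield obj
         pos = end

 # usage sketch: loop over every tweet in the capture and write one CSV row each
 # for tweet in iter_tweets(open(sys.argv[1]).read()):
 #     f.writerow(...)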

+7
2 answers

This should help you get started... you will still need to take care of the nested objects yourself (a rough sketch of that follows the code below).

 import json
 import csv

 f = file('test.json', 'r')
 data = json.load(f)

 result = []
 for k, v in data.iteritems():
     print k, v
     result.append(v)

 f = file('output.csv', 'w')
 writer = csv.writer(f)
 writer.writerow(result)  # one row of the top-level values
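
For the nested parts (entities, user), something along these lines could flatten them into plain columns first. This is just a sketch, and joined key names like user_screen_name are my own convention, not anything from the Twitter data:

 def flatten(d, prefix=''):
     # Collapse nested dicts into one level, joining keys with '_',
     # so tweet['user']['screen_name'] becomes the column user_screen_name.
     # Lists (hashtags, urls, user_mentions) are left as-is.
     flat = {}
     for k, v in d.iteritems():
         key = prefix + k
         if isinstance(v, dict):
             flat.update(flatten(v, key + '_'))
         else:
             flat[key] = v
     return flat

 # usage sketch: row = flatten(data) gives a single-level dict whose keys
 # can serve as CSV column headings and whose values fill the row
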
+4

The input here is JSON. Python has a JSON module. Fortunately, it also has a CSV module. So your input and output are taken care of!

Update

You're getting there!

Your writerow() call should look more like this (note: not inside a for loop). json.load() on a single tweet gives you one dict, and looping over a dict iterates over its keys, which are strings; that is where the "string indices must be integers" error was coming from:

 f.writerow(
     (
         jsonindata['contributors'],
         jsonindata['coordinates'],
         jsonindata['created_at'],
         jsonindata['entities']['hashtags'],
         jsonindata['entities']['urls'],
         jsonindata['entities']['user_mentions'],
         jsonindata['favorited'],
         jsonindata['geo'],
         jsonindata['id'],
         jsonindata['id_str'],
         jsonindata['in_reply_to_screen_name'],
         jsonindata['in_reply_to_status_id'],
         jsonindata['in_reply_to_status_id_str'],
         jsonindata['in_reply_to_user_id'],
         jsonindata['in_reply_to_user_id_str'],
         jsonindata['place'],
         jsonindata['retweet_count'],
         jsonindata['retweeted'],
         jsonindata['source'],
         jsonindata['text'].encode('utf8'),
         jsonindata['truncated'],
         jsonindata['user']['contributors_enabled'],
         jsonindata['user']['created_at'],
         jsonindata['user']['description'],
         jsonindata['user']['favourites_count'],
         jsonindata['user']['follow_request_sent'],
         jsonindata['user']['followers_count'],
         jsonindata['user']['following'],
         jsonindata['user']['friends_count'],
         jsonindata['user']['geo_enabled'],
         jsonindata['user']['id'],
         jsonindata['user']['id_str'],
         jsonindata['user']['is_translator'],
         jsonindata['user']['lang'],
         jsonindata['user']['listed_count'],
         jsonindata['user']['location'],
         jsonindata['user']['name'].encode('utf8'),
         jsonindata['user']['notifications'],
         jsonindata['user']['profile_background_color'],
         jsonindata['user']['profile_background_image_url'],
         jsonindata['user']['profile_background_tile'],
         jsonindata['user']['profile_image_url'],
         jsonindata['user']['profile_link_color'],
         jsonindata['user']['profile_sidebar_border_color'],
         jsonindata['user']['profile_sidebar_fill_color'],
         jsonindata['user']['profile_text_color'],
         jsonindata['user']['profile_use_background_image'],
         jsonindata['user']['protected'],
         jsonindata['user']['screen_name'],
         jsonindata['user']['show_all_inline_media'],
         jsonindata['user']['statuses_count'],
         jsonindata['user']['time_zone'],
         jsonindata['user']['url'],
         jsonindata['user']['utc_offset'],
         jsonindata['user']['verified']
     )
 )

Also consider using DictWriter, but remember that the Python 2 CSV module does not handle Unicode well, hence the .encode('utf8') on a couple of the tuple elements.
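
A minimal DictWriter sketch, if you decide to go that route. The column list here is only an example, it assumes jsonindata is the parsed tweet dict from your script, and note that Python 2.6's DictWriter has no writeheader(), so the header row is written by hand:

 import csv

 fieldnames = ['id_str', 'created_at', 'text', 'screen_name', 'followers_count']

 out = open('test.csv', 'wb')
 writer = csv.DictWriter(out, fieldnames=fieldnames, extrasaction='ignore')
 writer.writerow(dict(zip(fieldnames, fieldnames)))  # header row

 writer.writerow({
     'id_str': jsonindata['id_str'],
     'created_at': jsonindata['created_at'],
     'text': jsonindata['text'].encode('utf8'),
     'screen_name': jsonindata['user']['screen_name'],
     'followers_count': jsonindata['user']['followers_count'],
 })
 out.close()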

+4
