BuzzFace/threaded_comments.py at master · gsantia/BuzzFace · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
import csv, httplib, glob, json, os, re, urllib2, time
from collections import Counter
import sys

# Facebook Graph API credentials, taken from the first two command-line
# arguments. They are combined as "APP_ID|APP_SECRET" to form the access token
# in the request URLs built below.
# These are read when the module is loaded, so running (or importing) this
# file with fewer than two arguments raises IndexError right here.
APP_ID = str(sys.argv[1])
APP_SECRET = str(sys.argv[2])

def createPostCommentsUrl(COMMENT_ID, APP_ID, APP_SECRET):
    """Build the Graph API URL listing the comments under COMMENT_ID.

    The access token is the app id and app secret joined by "|". The query
    asks for chronological ordering with limit=1000.
    """
    #there's an article with 159047 comments
    query_parts = ["/comments?access_token=", APP_ID, "|", APP_SECRET,
                   "&order=chronological&limit=1000"]
    return "https://graph.facebook.com/" + COMMENT_ID + "".join(query_parts)

def createAnon(USER_ID, APP_ID, APP_SECRET):
    """Build the Graph API URL for the third_party_id lookup of USER_ID.

    The access token is the app id and app secret joined by "|".
    """
    endpoint = "https://graph.facebook.com/" + USER_ID + "/third_party_id?access_token="
    access_token = APP_ID + "|" + APP_SECRET
    return endpoint + access_token

def getAnon(USER_ID):
    """Fetch the third_party_id lookup for USER_ID and return the decoded JSON.

    The request URL comes from createAnon() using the module-level APP_ID and
    APP_SECRET. No errors are handled here: anything raised by
    urllib2.urlopen() or json.loads() propagates to the caller.
    """
    anon_url = createAnon(USER_ID, APP_ID, APP_SECRET)
    web_response = urllib2.urlopen(anon_url)
    try:
        readable_page = web_response.read()
    finally:
        # urllib2 responses are not context managers on Python 2, so release
        # the connection explicitly even when read() fails.
        web_response.close()
    return json.loads(readable_page)

def openPage(webpage):
    """Open _webpage_ with a browser-like User-agent and return the response.

    Every failure is reported on stdout and swallowed; in that case the
    function returns None, so callers have to cope with a None result.
    """
    request = urllib2.Request(webpage, headers={'User-agent' : 'Mozilla/5.0'})
    try:
        return urllib2.urlopen(request)
    except urllib2.HTTPError as http_error:
        # HTTPError is a subclass of URLError, so it has to be listed first;
        # the body of the error response is what gets printed.
        print(http_error.fp.read())
    except urllib2.URLError:
        print("URL ERROR:  %s" % (webpage,))
    except httplib.HTTPException:
        print("HTTP EXCEPTION:  %s" % (webpage,))
    except Exception:
        print("other exception loading the page:  %s" % (webpage,))

    # only reached when one of the handlers above ran
    return None


def _loadJsonPage(url):
    """Fetch _url_ through openPage() and return its body decoded from JSON.

    Raises IOError when openPage() could not open the page (it returns None
    in that case). JSON decoding errors propagate as ValueError.
    """
    web_response = openPage(url)
    if web_response is None:
        raise IOError("no response from " + url)
    try:
        return json.loads(web_response.read())
    finally:
        # release the connection whether or not the body could be decoded
        web_response.close()


def getPostComments(COMMENT_ID, n):
    """returns a JSON of a comment's comments

    COMMENT_ID -- id of the comment whose replies are requested
    n          -- seconds to sleep before every API request

    Returns {"data": [...]} with the replies of every batch that could be
    fetched, following the 'next' link under 'paging' from batch to batch.
    Returns an empty dict when even the first request fails. When a later
    batch fails, paging stops and the replies collected so far are returned.
    """
    data = []
    post_url = createPostCommentsUrl(COMMENT_ID, APP_ID, APP_SECRET)
    batchnum = 1

    time.sleep(n)
    try:
        thisdata = _loadJsonPage(post_url)
        data.extend(thisdata['data'])
    except Exception:
        print("comments didn't work at all!")
        #in this case, just return an empty dict
        return {}

    # a missing or empty 'paging' entry simply means there is no next batch
    next_url = (thisdata.get('paging') or {}).get('next')
    while next_url:
        batchnum += 1
        time.sleep(n)
        print("comments batch:  %s" % (batchnum,))
        try:
            thisdata = _loadJsonPage(next_url)
            if thisdata['data']:
                data.extend(thisdata['data'])
        except Exception:
            print("problem getting extra batches")
            # Stop here: the 'next' link did not advance, so looping again
            # would request the same failing URL forever.
            break
        next_url = (thisdata.get('paging') or {}).get('next')

    comments = {"data" : data}
    return comments

def retrieve(outlet):
    """initiate the retrieval of the comment replies for a specific _outlet_

    Looks for ./dataset/<outlet>/*/comments.json, adds a 'replies' key to
    every comment listed under that file's 'data' key, and writes the
    resulting list to a replies.json in the same directory. Files that cannot
    be opened or parsed are reported on stdout and skipped.

    Fetching replies is currently disabled (see the NOTE in the loop), so
    every comment gets an empty 'replies' list.
    """

    path = './dataset/'
    files = glob.glob(path + outlet + '/*/comments.json')
    # NOTE(review): only the first 10 matching files are processed. This looks
    # like a leftover debugging cap -- confirm before relying on full coverage.
    for i, filename in enumerate(files[:10]):
        print("working on %s" % str(i))
        #open the old comments file
        try:
            with open(filename, 'r') as f:
                commentsJSON = json.load(f)
        except (IOError, OSError, ValueError):
            # unreadable file or invalid JSON
            print("problem opening comments.json")
            continue

        commentsJSON = commentsJSON.get('data', []) #this is just a list
        for j, comment in enumerate(commentsJSON):
            print("          comment #:  " + str(j))
            COMMENT_ID = comment['id']
            #add the replies key
            # NOTE: this will take an extremely long time with the new API limit,
            # so this has been commented out. if you'd like to wait and get every single
            # reply, uncomment this and get rid of the dummy replies line following
            # (n is the module-level sleep interval set in the __main__ section)
            #comment['replies'] = getPostComments(COMMENT_ID, n).get('data', [])
            comment['replies'] = []
        #now write the new file to disk, next to the comments.json it came from
        new_filename = os.path.join(os.path.dirname(filename), 'replies.json')
        # open(..., 'w') creates the file itself, so no shell `touch` is needed;
        # building a shell command from the path broke on spaces/metacharacters.
        with open(new_filename, 'w') as f2:
            json.dump(commentsJSON, f2, indent = 4, sort_keys = True)

if __name__ == '__main__':
    # n is the number of seconds the script sleeps after each API request.
    # It is a module-level name so the (currently commented-out) reply fetch
    # in retrieve() can read it. Change at your own risk.
    #
    # NOTE: changed from 1 to 20 due to changes in FB API
    n = 20

    outlets = [
        'ABC_News_Politics',
        'Politico',
        'CNN_Politics',
        'Addicting_Info',
        'Eagle_Rising',
        'Freedom_Daily',
        'Occupy_Democrats',
        'Right_Wing_News',
        'The_Other_98%',
    ]

    for outlet in outlets:
        # outlets without a dataset directory are silently skipped
        if not os.path.isdir('./dataset/' + outlet):
            continue
        retrieve(outlet)