-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgoodreads.py
More file actions
157 lines (131 loc) · 4.75 KB
/
goodreads.py
File metadata and controls
157 lines (131 loc) · 4.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import requests
import urllib.request
import time
import re
import collections
from bs4 import BeautifulSoup
from multiprocessing import Pool
# One cached row of data.csv: the book's own rating (stars, votes), the mean
# of its reviewers' average ratings (user_avg) and the number of reviewers
# found on its page (user_count).
Book = collections.namedtuple('Book', ['stars', 'votes', 'user_avg', 'user_count'])

# name -> (stars, votes) for every book listed in books.csv.
books = {}
# name -> score for each book scored by parse_book.
scores = {}

# Each line of books.csv is "stars,votes,name".
with open('books.csv', 'r') as book_csv:
    for line in book_csv:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing the three-way unpack below.
            continue
        # The name is the last field, so cap the split: a comma inside the
        # name must not produce extra fields.
        stars, votes, name = line.split(',', 2)
        books[name] = (float(stars), int(votes))
print(len(books))
# Our model: the true score is (5 - stars)/(5 - user_avg).
# However, most good scores by this measure will be outliers.
# To build a prior you might look at all books with at least x ratings (say 400)
# Then try to find a fast way to get a posterior from this given the observed
# rating. Instead, we will use the following ad-hoc thing:
# We calculate stars2 which is such that the chance of getting stars or above
# is 1/x. We want x to be something like (# books checked)/position.
# Assuming stars is 5 - p this works out to a quadratic in p.
# p^2 (n^2 + ns^2) - p (2nk + ns^2) + k^2 = 0
# Here s = number of standard deviations (about 4)
# k = n*(5 - stars)
def score(stars, votes, user_avg):
    """Return the ranking score of a book; lower scores rank first.

    Solves the quadratic a*p**2 - b*p + c = 0 for p (taking the larger
    root), where, with n = votes and k = n * (5 - stars):

        a = n**2 + n * s**2
        b = 2 * n * k + n * s**2
        c = k**2

    and s = 4 is the number of standard deviations. The result is p divided
    by the reviewers' distance from a perfect average (5 - user_avg) and
    scaled by 1.1.

    Args:
        stars: The book's average rating.
        votes: Number of ratings behind ``stars``.
        user_avg: Mean of the average ratings given by the book's reviewers.

    Returns:
        1 when there are no votes or the quadratic has no real root;
        ``float('inf')`` when ``user_avg`` is 5 or more (the ratio is
        undefined or would flip sign, so the book is ranked last);
        otherwise the scaled ratio described above.
    """
    if votes == 0:
        return 1

    s = 4  # number of standard deviations
    shortfall = 5 - stars
    a = votes**2 + s**2 * votes
    b = 2 * votes**2 * shortfall + s**2 * votes
    c = votes**2 * shortfall**2

    discriminant = b**2 - 4 * a * c
    if discriminant < 0:
        return 1
    p = (b + discriminant**0.5) / (2 * a)

    headroom = 5 - user_avg
    if headroom <= 0:
        # Previously this divided by zero (user_avg == 5) or produced a
        # negative score that sorted ahead of every real book (user_avg > 5).
        return float('inf')

    # Mean user rating is around 3.9
    # (presumably the 1.1 factor is 5 - 3.9, normalising a typical reviewer).
    return p / headroom * 1.1
# Rank every known book using a fixed reviewer average of 4 and keep the
# 2000 lowest-scoring candidates as the books to scrape.
book_list = [(name, books[name][0], books[name][1]) for name in books]
book_list.sort(key=lambda book: score(book[1], book[2], 4))
book_list = book_list[:2000]
book_count = 0  # not referenced elsewhere in the code shown here

# Memo used by parse_user: user id -> average rating (or None).
all_users = {}
# name -> Book for every book already recorded in data.csv.
user_checked = {}

# Each line of data.csv is "stars,votes,user_avg,user_count,users,name".
# The file is only created once parse_book has appended its first row, so a
# missing file simply means nothing has been cached yet.
try:
    data_csv = open('data.csv', 'r')
except FileNotFoundError:
    pass
else:
    with data_csv:
        for line in data_csv:
            line = line.strip()
            if not line:
                # Tolerate blank lines rather than failing the unpack.
                continue
            # The name is the last field; cap the split so a comma inside
            # it does not produce extra fields.
            stars, votes, user_avg, user_count, users, name = line.split(',', 5)
            user_checked[name] = Book(
                float(stars), int(votes), float(user_avg), int(user_count))

# Show the 100 lowest-scoring books among those already cached.
checked_list = list(user_checked.items())
checked_list.sort(key=lambda pair:
                  score(pair[1].stars, pair[1].votes, pair[1].user_avg))
for name, book in checked_list[:100]:
    print("%f %s" % (score(book.stars, book.votes, book.user_avg), name))

# URL pieces: root + book_str + <book name>, root + user_str + <user id>.
root = "https://www.goodreads.com"
book_str = "/book/show/"
user_str = "/user/show/"
lst = "10198"  # not referenced elsewhere in the code shown here
def parse_user(user):
    """Return a user's average rating scraped from their profile page.

    Fetches root + user_str + user, looks through the page's links for one
    whose text contains "avg" together with a decimal number, and returns
    that number.

    Args:
        user: The numeric user id, as a string.

    Returns:
        The average rating as a float, or None when no such link is found
        or the average is below 1 (a user with no ratings).

    Side effects:
        Stores the result in the module-level ``all_users`` memo and, for a
        usable average, appends an "avg,user" line to users.csv.

    NOTE(review): parse_book calls this through a multiprocessing Pool, so
    memo entries written in a worker process presumably never reach the
    parent process - confirm whether the memo is effective.
    """
    # Save a little work with a memo
    if user in all_users:
        return all_users[user]

    # Get page
    user_url = root + user_str + user
    user_response = requests.get(user_url)
    user_soup = BeautifulSoup(user_response.text, 'html.parser')

    # This must handle
    #   public  - /user/show/42390
    #   private - /user/show/564646
    #   author  - /user/show/167451
    # Pick the link with text "avg"
    for ref in user_soup.select('a'):
        text = ref.get_text()
        if 'avg' not in text:
            continue
        # Extract float (raw string: "\d" is not a valid str escape)
        num = re.findall(r"\d+\.\d+", text)
        if not num:
            # The link mentions "avg" but carries no decimal number; keep
            # looking instead of raising IndexError.
            continue
        avg = float(num[0])
        # Careful if a user with no ratings writes a non-rating review
        if avg < 1:
            all_users[user] = None
            return None
        all_users[user] = avg
        with open('users.csv', 'a+') as user_csv:
            user_csv.write("%.2f,%s" % (avg, user) + '\n')
        return avg
    return None
# TODO kill session
#s = requests.session()
#s.config['keep_alive'] = False
def parse_book(book):
    """Scrape one book's reviewers, record their mean rating and score it.

    Does nothing if the book is already present in ``user_checked``.
    Otherwise fetches root + book_str + name, collects the user ids linked
    from the page's ``a.user`` elements, looks up each user's average rating
    with parse_user (in parallel), and averages the usable results.

    Args:
        book: A (name, stars, votes) tuple.

    Returns:
        None.

    Side effects:
        Appends a "stars,votes,user_avg,user_count,users,name" row to
        data.csv, stores the score in the module-level ``scores`` dict and
        prints "score name". A book with no usable reviewer average is
        skipped without writing anything, so it is retried on a later run.
    """
    name, stars, votes = book
    if name in user_checked:
        return

    # Get page
    book_url = root + book_str + name
    book_response = requests.get(book_url)
    book_soup = BeautifulSoup(book_response.text, 'html.parser')
    # TODO get avg, num ratings
    #rating_node = book_soup.select('[itemprop=ratingValue]')
    #rating = float(rating_node[0].get_text())

    # Get users reviewing this book; the user id is the first run of digits
    # in each link. Links without digits are ignored rather than raising
    # IndexError.
    users = []
    for review in book_soup.select('a.user'):
        ids = re.findall(r"\d+", review['href'])
        if ids:
            users.append(ids[0])
    if not users:
        return

    # Get users in parallel
    p = Pool(30)  # Pool tells how many at a time
    try:
        avgs = p.map(parse_user, users)
    finally:
        # Always release the worker processes, even if a lookup raised.
        p.terminate()
        p.join()

    avgs = [avg for avg in avgs if avg is not None]
    if not avgs:
        # Every reviewer was private or unrated: there is no mean to take
        # (this used to raise ZeroDivisionError and abort the whole run).
        return
    user_avg = sum(avgs) / len(avgs)

    # Write this down in case we fail
    with open('data.csv', 'a+') as data_csv:
        data_csv.write("%.2f,%d,%f,%d,%s,%s"
                       % (stars, votes, user_avg, len(users), ' '.join(users), name)
                       + '\n')
    scores[name] = score(stars, votes, user_avg)
    print("%f %s" % (scores[name], name))
# Scrape the candidate books one at a time; parse_book already uses a Pool
# for the per-user requests.
# The guard keeps this loop from running when the module is imported rather
# than executed - in particular when multiprocessing workers re-import the
# main module under the "spawn" start method.
if __name__ == "__main__":
    for book in book_list:
        parse_book(book)