-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathStack Overflow Questions & Answers
More file actions
88 lines (73 loc) · 3.57 KB
/
Stack Overflow Questions & Answers
File metadata and controls
88 lines (73 loc) · 3.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
--Dataset: "stackoverflow" from BigQuery
--Source: https://www.kaggle.com/datasets/stackoverflow/stackoverflow
--Queried using: Kaggle Notebook
from google.cloud import bigquery
# Create a "Client" object
client = bigquery.Client()
# Construct a reference to the "stackoverflow" dataset
dataset_ref = client.dataset("stackoverflow", project="bigquery-public-data")
# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)
# Construct a reference to the "posts_questions" table
table_ref = dataset_ref.table("posts_questions")
# API request - fetch the table
table = client.get_table(table_ref)
# Preview the first five lines of the table
client.list_rows(table, max_results=5).to_dataframe()
# Construct a reference to the "posts_answers" table
table_ref = dataset_ref.table("posts_answers")
# API request - fetch the table
table = client.get_table(table_ref)
# Preview the first five lines of the table
client.list_rows(table, max_results=5).to_dataframe()
# How long does it take for questions to receive answers?
correct_query = """
SELECT q.id AS q_id,
MIN(TIMESTAMP_DIFF(a.creation_date, q.creation_date, SECOND)) as time_to_answer
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON q.id = a.parent_id
WHERE q.creation_date >= '2018-01-01' and q.creation_date < '2018-02-01'
GROUP BY q_id
ORDER BY time_to_answer
"""
# Run the query, and return a pandas DataFrame
correct_result = client.query(correct_query).result().to_dataframe()
print("Percentage of answered questions: %s%%" % \
(sum(correct_result["time_to_answer"].notnull()) / len(correct_result) * 100))
print("Number of questions:", len(correct_result))
# Which users have only asked questions or provided answers -- but not both?
q_and_a_query = """
SELECT q.owner_user_id AS owner_user_id,
MIN(q.creation_date) AS q_creation_date,
MIN(a.creation_date) AS a_creation_date
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
FULL JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON q.owner_user_id = a.owner_user_id
WHERE q.creation_date >= '2019-01-01' AND q.creation_date < '2019-02-01'
AND a.creation_date >= '2019-01-01' AND a.creation_date < '2019-02-01'
GROUP BY owner_user_id
"""
# Query activity of users who joined the site in January 2019
three_tables_query = """
SELECT u.id AS id,
MIN(q.creation_date) AS q_creation_date,
MIN(a.creation_date) AS a_creation_date
FROM `bigquery-public-data.stackoverflow.users` AS u
LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON a.owner_user_id = u.id
LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` AS q
ON q.owner_user_id = u.id
WHERE u.creation_date >= '2019-01-01' AND u.creation_date < '2019-02-01'
GROUP BY id
"""
# How many distinct users posted on January 1, 2019?
all_users_query = """
SELECT q.owner_user_id
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
WHERE EXTRACT(DATE FROM q.creation_date) = '2019-01-01'
UNION DISTINCT
SELECT a.owner_user_id
FROM `bigquery-public-data.stackoverflow.posts_answers` AS a
WHERE EXTRACT(DATE FROM a.creation_date) = '2019-01-01'
"""