Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,11 @@ The `scrape.py` script collects tweet ids. If you know a tweet's id number, you

## Running the scraper

- open up `scrape.py` and edit the user, start, and end variables (and save the file)
- run `python3 scrape.py`
- <del>~~open up `scrape.py` and edit the user, start, and end variables (and save the file)~~</del>
- run `python3 scrape.py` and add the arguments you need
- `-u` followed by the username
- `--since` followed by a date string (e.g. `2017-01-01`); otherwise the fallback value assigned to `start` in `scrape.py` (2010-01-01) is used
- `--until` followed by a date string (e.g. `2018-01-01`); otherwise `end` defaults to the current date and time
- you'll see a browser pop up and output in the terminal
- do some fun other task until it finishes
- once it's done, it outputs all the tweet ids it found into `all_ids.json`
Expand Down
31 changes: 21 additions & 10 deletions scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,33 @@
from time import sleep
import json
import datetime


# Scrape settings: edit these three variables to choose the account and the
# date range.
user = 'realdonaldtrump' # account handle placed after `from:` in the search query
start = datetime.datetime(2010, 1, 1) # first day of the range: year, month, day
end = datetime.datetime(2016, 12, 7) # last day of the range: year, month, day
import argparse

# Command-line interface. All three options are optional and arrive as plain
# strings (or None when omitted) on the `args` namespace.
parser = argparse.ArgumentParser(
    prog="scrape.py",
    usage="python3 %(prog)s [options]",
    description="scrape.py - Twitter Scraping Tool",
)
parser.add_argument("-u", help="Scrape this user's Tweets")
parser.add_argument("--since", help="Get Tweets after this date (Example: 2010-01-01).")
parser.add_argument("--until", help="Get Tweets before this date (Example: 2018-12-07).")
args = parser.parse_args()

# Resolve the scraping window from the command-line arguments.
#
# Dates must be written as YYYY-MM-DD. strptime validates the whole string, so
# a malformed value fails with a ValueError that names the expected format,
# rather than being sliced at fixed offsets where separators and trailing
# text are silently ignored.
date_format = '%Y-%m-%d'

if args.since is not None:
    start = datetime.datetime.strptime(args.since, date_format)
else:
    # fallback date; only used when there is no `--since` argument
    start = datetime.datetime(2010, 1, 1)  # year, month, day

if args.until is not None:
    end = datetime.datetime.strptime(args.until, date_format)
else:
    # no `--until` argument: the window runs up to the current date and time
    end = datetime.datetime.now()

# only edit these if you're having problems
delay = 1  # time to wait on each page load before reading the page
# Create exactly one driver: every webdriver constructor call opens its own
# browser session, so assigning `driver` twice would start a browser that is
# immediately discarded.
driver = webdriver.Firefox()  # options are Chrome() Firefox() Safari()


# don't mess with this stuff
twitter_ids_filename = 'all_ids.json'  # output file name for the collected ids
days = (end - start).days + 1  # whole days between start and end, plus one
id_selector = '.time a.tweet-timestamp'
tweet_selector = 'li.js-stream-item'
user = user.lower()  # normalise the handle to lower case
ids = []

def format_day(date):
Expand All @@ -31,8 +40,10 @@ def format_day(date):
return '-'.join([year, month, day])

def form_url(since, until):
    """Build the Twitter search URL for one date window.

    `since` and `until` are date strings inserted verbatim after the
    `since:` and `until:` search operators. When the `-u` command-line
    option was given, the query is restricted with `from:<user>`; otherwise
    no `from:` filter is added.

    Returns the complete URL as a string.
    """
    query = ''
    if args.u is not None:
        query += 'from%3A{0.u}'.format(args)
    # Every operator is separated by an encoded space (%20), including the
    # trailing `include:retweets`, so it is not glued onto the `until` date.
    query += '%20since%3A' + since + '%20until%3A' + until + '%20include%3Aretweets'
    return 'https://twitter.com/search?f=tweets&vertical=default&q=' + query + '&src=typd'

def increment_day(date, i):
Expand Down