From 33d8cf4391f3c1ecedd6bb60f5d6f5abe490c49e Mon Sep 17 00:00:00 2001
From: 4sakura <4sakura@web.de>
Date: Fri, 23 Mar 2018 20:02:04 +0100
Subject: [PATCH] add argparse

You no longer need to edit scrape.py to search for another user or to
change the start or end date. Pass the options `-u`, `--since` and
`--until` on the command line instead.

- `-u` is optional; without it the search is not restricted to one user.
- `--since` defaults to 2010-01-01 and `--until` defaults to the current
  date; both take a date in YYYY-MM-DD format.
- The default webdriver changes from Safari() to Firefox().
---
 README.md |  7 +++++--
 scrape.py | 31 +++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 7b399d8..d76cfbb 100644
--- a/README.md
+++ b/README.md
@@ -23,8 +23,11 @@ The `scrape.py` script collects tweet ids. If you know a tweet's id number, you
 
 ## Running the scraper
 
-- open up `scrape.py` and edit the user, start, and end variables (and save the file)
-- run `python3 scrape.py`
+- ~~open up `scrape.py` and edit the user, start, and end variables (and save the file)~~
+- run `python3 scrape.py` and add the arguments you need
+  - `-u` followed by the username
+  - `--since` followed by the start date in `YYYY-MM-DD` format (e.g. `2017-01-01`); defaults to `2010-01-01`
+  - `--until` followed by the end date in `YYYY-MM-DD` format (e.g. `2018-01-01`); defaults to the current date
 - you'll see a browser pop up and output in the terminal
 - do some fun other task until it finishes
 - once it's done, it outputs all the tweet ids it found into `all_ids.json`
diff --git a/scrape.py b/scrape.py
index 784e087..acab52b 100644
--- a/scrape.py
+++ b/scrape.py
@@ -4,16 +4,26 @@
 from time import sleep
 import json
 import datetime
-
-
-# edit these three variables
-user = 'realdonaldtrump'
-start = datetime.datetime(2010, 1, 1)  # year, month, day
-end = datetime.datetime(2016, 12, 7)  # year, month, day
+import argparse
+
+parser = argparse.ArgumentParser(prog="scrape.py", usage="python3 %(prog)s [options]", description="scrape.py - Twitter Scraping Tool")
+parser.add_argument("-u", help="Scrape this user's Tweets")
+parser.add_argument("--since", help="Get Tweets after this date (Example: 2010-01-01).")
+parser.add_argument("--until", help="Get Tweets before this date (Example: 2018-12-07).")
+args = parser.parse_args()
+
+if args.since is not None:
+    start = datetime.datetime.strptime(args.since, "%Y-%m-%d")  # raises ValueError unless the value is YYYY-MM-DD
+else:
+    start = datetime.datetime(2010, 1, 1)  # year, month, day; fallback date, only used when `--since` is not given
+if args.until is not None:
+    end = datetime.datetime.strptime(args.until, "%Y-%m-%d")  # raises ValueError unless the value is YYYY-MM-DD
+else:
+    end = datetime.datetime.now()
 
 # only edit these if you're having problems
 delay = 1  # time to wait on each page load before reading the page
-driver = webdriver.Safari()  # options are Chrome() Firefox() Safari()
+driver = webdriver.Firefox()  # options are Chrome() Firefox() Safari()
 
 
 # don't mess with this stuff
@@ -21,7 +31,6 @@
 days = (end - start).days + 1
 id_selector = '.time a.tweet-timestamp'
 tweet_selector = 'li.js-stream-item'
-user = user.lower()
 ids = []
 
 def format_day(date):
@@ -31,8 +40,10 @@ def format_day(date):
     return '-'.join([year, month, day])
 
 def form_url(since, until):
-    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q=from%3A'
-    p2 = user + '%20since%3A' + since + '%20until%3A' + until + 'include%3Aretweets&src=typd'
+    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q='
+    if args.u is not None:
+        p1 += 'from%3A' + args.u.lower()  # keep the lower-casing the removed `user = user.lower()` used to do
+    p2 = '%20since%3A' + since + '%20until%3A' + until + 'include%3Aretweets&src=typd'
     return p1 + p2
 
 def increment_day(date, i):