# TwitterScrape/twitterbot.py at main · li21rich/TwitterScrape · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import time, random
from datetime import datetime

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def increment_month(date_str):
	"""Return the first day of the month after the one containing ``date_str``.

	``date_str`` is parsed as ``YYYY-MM-DD`` and the result is formatted the
	same way; the day component of the result is always ``01``. December
	rolls over to January of the following year.

	Raises ``ValueError`` if ``date_str`` does not match ``%Y-%m-%d``.
	"""
	parsed = datetime.strptime(date_str, "%Y-%m-%d")
	# divmod(12, 12) == (1, 0): December carries one year and wraps to index 0;
	# every other month yields carry 0 and its own number as the next index.
	year_carry, next_month_index = divmod(parsed.month, 12)
	first_of_next = datetime(parsed.year + year_carry, next_month_index + 1, 1)
	return first_of_next.strftime("%Y-%m-%d")

def human_type(bot, element, text):
	"""Click ``element`` and type ``text`` into it one character at a time.

	``bot`` is the driver used to build the ``ActionChains`` that moves to
	and clicks the element; each character is then sent with
	``element.send_keys`` followed by a random pause.
	"""
	ActionChains(bot).move_to_element(element).click().perform()
	for keystroke in text:
		element.send_keys(keystroke)
		# Random 0.02-0.2 s gap between keystrokes to simulate human typing.
		pause = random.uniform(0.02, 0.2)
		time.sleep(pause)


class Twitterbot:
	"""Chrome/Selenium session that logs in to Twitter and scrapes tweet search pages.

	The scraping methods print each tweet as it is recorded, write the full
	result text to ``OUTPUT_PATH`` (overwriting it) and return that text.
	"""

	LOGIN_URL = 'https://twitter.com/i/flow/login'
	TWEET_XPATH = "//article[@data-testid='tweet']"
	OUTPUT_PATH = "output.txt"
	# Consecutive scrolls that may yield no new tweet before scrolling stops.
	MAX_STALLED_SCROLLS = 3
	# Attempts made on one date window before advanced_scrape moves on.
	MAX_ATTEMPTS_PER_WINDOW = 3
	# Pause between attempts on the same date window.
	RATE_LIMIT_PAUSE_SECONDS = 15 * 60

	def __init__(self, email, password, username, headless):
		"""Store the credentials and start a Chrome driver.

		``headless`` enables ``--headless`` when it is the string "yes"
		(case-insensitive) or the boolean ``True``.
		"""
		self.email = email
		self.password = password
		self.username = username
		chrome_options = webdriver.ChromeOptions()
		if headless is True or str(headless).lower() == "yes":
			chrome_options.add_argument("--headless")
		chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
		chrome_options.add_experimental_option("useAutomationExtension", False)
		self.bot = webdriver.Chrome(
			service=Service(ChromeDriverManager().install()),
			options=chrome_options
		)

	def login_with_email(self):
		"""Log in by submitting the email, an optional username challenge, then the password."""
		bot = self.bot
		bot.get(self.LOGIN_URL)
		email_field = WebDriverWait(bot, 10).until(
			ec.visibility_of_element_located(('xpath', '//input[@autocomplete="username"]'))
		)
		email_field.send_keys(self.email, Keys.RETURN)
		try:
			username_field = WebDriverWait(bot, 5).until(
				ec.presence_of_element_located(('xpath', '//input[@autocomplete="on" and @name="text"]'))
			)
		except TimeoutException:
			# The extra username prompt did not appear within 5 s; go straight
			# to the password step.
			pass
		else:
			username_field.send_keys(self.username, Keys.RETURN)
		password_field = WebDriverWait(bot, 10).until(
			ec.presence_of_element_located(('xpath', '//input[@autocomplete="current-password"]'))
		)
		password_field.send_keys(self.password, Keys.RETURN)
		time.sleep(1)

	def login_with_username(self): # old version
		"""Log in by submitting the username and then the password."""
		bot = self.bot
		bot.get(self.LOGIN_URL)
		email_field = WebDriverWait(bot, 10).until(
			ec.presence_of_element_located(('xpath', '//input[@autocomplete="username"]'))
		)
		# Only move to and click the field here. Typing any placeholder text at
		# this point would be prepended to the username sent below.
		ActionChains(bot).move_to_element(email_field).click().pause(0.5).perform()
		email_field.send_keys(self.username, Keys.RETURN)
		password_field = WebDriverWait(bot, 10).until(
			ec.presence_of_element_located(('xpath', '//input[@autocomplete="current-password"]'))
		)
		password_field.send_keys(self.password, Keys.RETURN)
		time.sleep(1)

	def login_with_username_like_human(self):
		"""Log in with username and password, typing both via ``human_type``."""
		bot = self.bot
		bot.get(self.LOGIN_URL)

		# Username/email field
		username_field = WebDriverWait(bot, 10).until(
			ec.presence_of_element_located(('xpath', '//input[@name="text"]'))
		)
		human_type(bot, username_field, self.username)
		username_field.send_keys(Keys.RETURN)

		# Password field
		password_field = WebDriverWait(bot, 10).until(
			ec.presence_of_element_located(('xpath', '//input[@name="password"]'))
		)
		human_type(bot, password_field, self.password)
		password_field.send_keys(Keys.RETURN)

		time.sleep(1)

	def _scroll_down(self):
		"""Scroll to the bottom of the page and pause one second."""
		self.bot.execute_script('window.scrollTo(0, document.body.scrollHeight)')
		time.sleep(1)

	def _read_new_tweets(self, seen):
		"""Return ``(analytic, timestamp, text)`` for each tweet article not yet in ``seen``.

		``seen`` is a set of ``(timestamp, text)`` keys and is updated in
		place. Articles that lack the text, metrics or time element, or that
		go stale while being read, are skipped.
		"""
		articles = WebDriverWait(self.bot, 10).until(
			ec.presence_of_all_elements_located(('xpath', self.TWEET_XPATH))
		)
		fresh = []
		for article in articles:
			try:
				text = str(article.find_element("xpath", ".//div[@data-testid='tweetText']").text)
				analytic = str(article.find_element("xpath", ".//div[@role='group']").get_attribute('aria-label'))
				timestamp = str(article.find_element("xpath", ".//time").get_attribute('datetime'))
			except (NoSuchElementException, StaleElementReferenceException):
				continue
			# The metrics label is left out of the key so a tweet whose counts
			# changed between two reads is still recognised as the same tweet.
			key = (timestamp, text)
			if key in seen:
				continue
			seen.add(key)
			fresh.append((analytic, timestamp, text))
		return fresh

	@staticmethod
	def _format_entry(number, analytic, date, text):
		"""Build the header line plus tweet text for one recorded tweet."""
		return ":::> #" + str(number) + ". " + analytic + " " + date + " <:::\n" + text

	def _save(self, entries):
		"""Join ``entries`` one per line, write them to ``OUTPUT_PATH`` and return the text."""
		results = "".join(entry + "\n" for entry in entries)
		with open(self.OUTPUT_PATH, "w", encoding="utf-8") as file:
			file.write(results)
		return results

	def scrape(self, mintweets, query):
		"""Open ``query`` and record at least ``mintweets`` distinct tweets.

		Scrolling stops once the target is reached (the batch in progress is
		still recorded in full) or after ``MAX_STALLED_SCROLLS`` consecutive
		scrolls that produce no new tweet. Returns the result text, which is
		also written to ``OUTPUT_PATH``.

		Raises ``ValueError`` if ``mintweets`` is not an integer value, and
		Selenium's ``TimeoutException`` if no tweet article appears within
		10 seconds.
		"""
		bot = self.bot
		target = int(mintweets)
		bot.get(query)
		print("\nScanning:", query)
		WebDriverWait(bot, 10).until(ec.presence_of_element_located(('xpath', self.TWEET_XPATH)))
		seen = set()
		entries = []
		stalled = 0
		while len(entries) < target and stalled < self.MAX_STALLED_SCROLLS:
			self._scroll_down()
			fresh = self._read_new_tweets(seen)
			stalled = 0 if fresh else stalled + 1
			for analytic, timestamp, text in fresh:
				entry = self._format_entry(len(entries) + 1, analytic, timestamp, text)
				print(entry)
				entries.append(entry)
		if len(entries) < target:
			print("No more new tweets are loading; stopping at", len(entries), "of", target)
		return self._save(entries)

	def advanced_scrape(self, startingDate, endingDate, months, cap, in_query, sort):  # sorts by popular from start date to end date for each month
		"""Scrape up to ``cap`` distinct tweets from each of ``months`` consecutive date windows.

		The first window is ``startingDate``..``endingDate``; each following
		window starts at the previous end date and ends at
		``increment_month`` of it. ``sort`` of "yes" (case-insensitive)
		selects ``&f=top``, anything else ``&f=live``.

		A Selenium error while scanning a window is reported as rate limiting
		and the same window is retried after ``RATE_LIMIT_PAUSE_SECONDS``, up
		to ``MAX_ATTEMPTS_PER_WINDOW`` attempts, before moving on, so a
		failure does not use up one of the ``months`` windows.

		Returns the result text, which is also written to ``OUTPUT_PATH``.
		Raises ``ValueError`` if ``months`` or ``cap`` is not an integer value.
		"""
		# it might be a smart idea to modify this code to run by days instead of months,
		# because advanced scrape tends to bias tweets posted at the end of every month.
		bot = self.bot
		start_date = startingDate
		end_date = endingDate
		# Converted up front so a bad value fails immediately rather than being
		# reported as rate limiting inside the loop.
		window_total = int(months)  # i.e. 24 = scrape from each of the 24 months
		per_window_cap = int(cap)  # 50 is the recommended value for cap
		if sort.lower() == "yes":
			sort_param = "&f=top"  # sort by popular/trending
		else:
			sort_param = "&f=live"  # sort by latest/recent
		seen = set()
		entries = []
		for _ in range(window_total):
			# NOTE(review): "20since%3A" has no leading "%"; presumably in_query
			# is expected to end with "%" - confirm against the caller.
			query = in_query + "20since%3A" + start_date + "%20until%3A" + end_date + "&src=typed_query" + sort_param
			taken = 0  # tweets recorded for this window, kept across retries
			for attempt in range(1, self.MAX_ATTEMPTS_PER_WINDOW + 1):
				bot.get(query)
				print("Scanning from", start_date, "to", end_date)
				try:
					WebDriverWait(bot, 10).until(ec.presence_of_element_located(('xpath', self.TWEET_XPATH)))
					stalled = 0
					while taken < per_window_cap and stalled < self.MAX_STALLED_SCROLLS:
						self._scroll_down()
						fresh = self._read_new_tweets(seen)
						stalled = 0 if fresh else stalled + 1
						for analytic, timestamp, text in fresh:
							text = text.replace("\n\n", "\n").replace("\n\n", "\n")
							date = timestamp.split('T')[0]
							entry = self._format_entry(len(entries) + 1, analytic, date, text)
							print(entry)
							entries.append(entry)
							taken += 1
							if taken >= per_window_cap:
								break
				except WebDriverException:
					print("It seems you've been rate limited. You left off scraping from", start_date, "to", end_date)
					if attempt == self.MAX_ATTEMPTS_PER_WINDOW:
						print("Giving up on this date range after", attempt, "attempts and moving on.")
					else:
						print("No worries. The program will automatically continue your search in 15 minutes.")
						time.sleep(self.RATE_LIMIT_PAUSE_SECONDS)
				else:
					break
			start_date = end_date
			end_date = increment_month(end_date)
		return self._save(entries)