-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrapeui.py
More file actions
169 lines (134 loc) · 6.47 KB
/
scrapeui.py
File metadata and controls
169 lines (134 loc) · 6.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import html
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
from bs4 import BeautifulSoup
from curl_cffi import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
def scrape_company_fast(company_link):
    """Fetch one YC company page and return its Twitter/X profile links.

    Args:
        company_link: Absolute URL, or a path relative to ycombinator.com.

    Returns:
        De-duplicated list of profile URLs in first-seen order, or None
        when the page could not be fetched.
    """
    # Handle both relative paths and absolute URLs.
    if company_link.startswith("http"):
        url = company_link
    else:
        url = f"https://www.ycombinator.com{company_link}"
    try:
        # impersonate="chrome" makes curl_cffi present a real-browser TLS
        # fingerprint, which gets past Cloudflare's bot check.
        response = requests.get(url, impersonate="chrome", timeout=10)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.content, "html.parser")
        # One CSS selector covers both x.com and legacy twitter.com links.
        social_links = soup.select('a[href*="x.com"], a[href*="twitter.com"]')
        found_handles = []
        for link in social_links:
            href = link.get('href', '')
            # Skip tweet-intent / share widgets; keep only profile links.
            if "intent/tweet" not in href and "share" not in href:
                found_handles.append(href)
        # dict.fromkeys de-duplicates while preserving first-seen order;
        # list(set(...)) made the returned order nondeterministic.
        return list(dict.fromkeys(found_handles))
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None
def scroll_to_load_all_companies(driver):
    """Keep scrolling to the page bottom until its height stops growing.

    Used to force lazy-loaded company cards to render before parsing.
    """
    previous_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give lazy-loaded content time to appear
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            return  # nothing new loaded; we've reached the real bottom
        previous_height = current_height
def scrape_twitter_urls(start_url):
    """Scrape founder Twitter/X handles for every company on a YC listing page.

    Renders and scrolls the JS-driven listing with headless Chrome, then
    fetches the individual company pages concurrently with curl_cffi.

    Args:
        start_url: URL of a YC company-listing page.

    Returns:
        Tuple of (handles, total_companies, total_handles,
        companies_without_twitter, error_message); error_message is None
        on success.
    """

    def collect_company_links(page_source):
        # Helper: pull unique '/companies/...' hrefs out of rendered HTML,
        # preserving first-seen order (dict.fromkeys dedupes in O(n),
        # unlike the previous `href not in list` O(n^2) scan).
        soup = BeautifulSoup(page_source, "html.parser")
        return list(dict.fromkeys(
            a['href'] for a in soup.find_all('a', href=True)
            if '/companies/' in a['href']
        ))

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    # Real-browser User-Agent to avoid being blocked as a bot.
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    service = ChromeService(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver_open = True
    try:
        driver.get(start_url)
        time.sleep(3)  # allow the initial JS render to finish

        # --- BLOCK 1: sanity-check that company links exist at all ---
        company_links = collect_company_links(driver.page_source)
        if not company_links:
            return [], 0, 0, 0, "Error: Could not find company links. Page structure may have changed."

        # --- BLOCK 2: scroll to lazy-load the full list, then re-parse ---
        scroll_to_load_all_companies(driver)
        company_links = collect_company_links(driver.page_source)

        # Selenium is no longer needed for the individual pages.
        driver.quit()
        driver_open = False

        twitter_handles = []
        companies_with_no_twitter = 0

        # --- BLOCK 3: fetch company pages concurrently (10 at a time) ---
        print(f"Found {len(company_links)} companies. Starting fast scrape with curl_cffi...")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_url = {executor.submit(scrape_company_fast, link): link for link in company_links}
            for i, future in enumerate(as_completed(future_to_url)):
                handles = future.result()
                if handles:
                    twitter_handles.extend(handles)
                else:
                    companies_with_no_twitter += 1
                if (i + 1) % 10 == 0:
                    print(f"Processed {i + 1}/{len(company_links)}")

        # De-duplicate while keeping first-seen order; list(set(...))
        # made the reported order nondeterministic.
        twitter_handles = list(dict.fromkeys(twitter_handles))
        return twitter_handles, len(company_links), len(twitter_handles), companies_with_no_twitter, None
    except Exception as e:
        return [], 0, 0, 0, f"Unexpected error: {str(e)}"
    finally:
        # Only close the browser if it wasn't already shut down after
        # scrolling. The previous bare `except:` here could also swallow
        # KeyboardInterrupt/SystemExit; narrow it to Exception.
        if driver_open:
            try:
                driver.quit()
            except Exception:
                pass
def run_gradio(start_url):
    """Gradio handler: run the scrape for *start_url* and render HTML results.

    Args:
        start_url: YC listing-page URL typed into the textbox.

    Returns:
        An HTML fragment — either an error banner or the list of clickable
        handles followed by summary stats.
    """
    result = scrape_twitter_urls(start_url)
    twitter_handles, total_companies, total_twitter, no_twitter, error_msg = result

    # Escape everything interpolated into HTML: the error message may embed
    # exception text, and the URLs come from scraped (untrusted) pages.
    if error_msg:
        safe_msg = html.escape(error_msg)
        return f"<div style='border:1px solid #ff6b6b; padding:10px; color: #d63031;'><b>Error:</b> {safe_msg}</div>"
    clickable_links = "<br>".join(
        f'<a href="{html.escape(url, quote=True)}" target="_blank">{html.escape(url)}</a>'
        for url in twitter_handles
    )
    stats = f"""
    <br><b>Stats:</b><br>
    total companies found: {total_companies}<br>
    founders' twitter handles found: {total_twitter}<br>
    founders not on twitter: {no_twitter}
    """
    return f"<div style='border:1px solid #ddd; padding:10px;'>{clickable_links}</div>{stats}"
# Minimal web UI: one URL textbox in, rendered HTML results out.
demo = gr.Interface(
    fn=run_gradio,
    inputs=gr.Textbox(label="Start URL"),
    outputs=gr.HTML(label="Results"),
    title="Find Twitter handles of YC founders",
)
demo.launch(share=True)