-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper_scrapeme.py
More file actions
111 lines (81 loc) · 3.27 KB
/
scraper_scrapeme.py
File metadata and controls
111 lines (81 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
Simple web scraper for scrapeme.live
Extracts product titles, prices, image URLs, and product URLs across all pages.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs4
import time
import random
# URL of the first shop listing page; the script entry point passes it to
# scrapeme(), which derives later page URLs by appending "page/<n>/".
baseURL = "https://scrapeme.live/shop/"
def scrapeme(baseURL):
    """Scrape every product from a paginated shop listing.

    The first page is fetched once: it is used both to read the highest page
    number from the ``ul.page-numbers`` pagination list and as the content of
    page 1, so it is not downloaded a second time. Later pages are requested
    from ``<baseURL>/page/<n>/``. Progress and each scraped product are
    printed to stdout.

    Args:
        baseURL: URL of the first listing page. A trailing slash is optional;
            it is normalised when building the URLs of later pages.

    Returns:
        A list of dicts, one per product, with the keys ``"Title"``,
        ``"Price"``, ``"Image URL"`` and ``"Product URL"``. A field whose tag
        is not found gets a ``"No ..."`` placeholder. An empty list is
        returned when the first page cannot be fetched; a later page that
        fails is reported and skipped.
    """
    # Browser-like User-Agent sent with every request made by the session.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    }
    products_info = []

    # The context manager closes the session (and its pooled connections) on
    # every exit path, including the early return below.
    with requests.Session() as session:
        session.headers.update(headers)

        try:
            response = session.get(baseURL, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the page: {e}")
            return []
        print(f"Response Status Code: {response.status_code}")

        first_page = bs4(response.text, "lxml")
        max_page_number = _max_page_number(first_page)
        print(f"Max page number: {max_page_number}")

        # Strip any trailing slash so the page URL is well-formed whether or
        # not the caller supplied one.
        page_root = baseURL.rstrip("/")

        for page_number in range(1, max_page_number + 1):
            print(f"\nScraping page {page_number}...")

            if page_number == 1:
                # Already downloaded above; reuse it instead of re-fetching.
                soup = first_page
            else:
                # Pause before every further request (also after a failed
                # one) so consecutive requests are always spaced out.
                time.sleep(random.uniform(0.5, 2.0))
                page_url = f"{page_root}/page/{page_number}/"
                try:
                    response = session.get(page_url, timeout=10)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    print(f"Error on page {page_number}: {e}")
                    continue
                soup = bs4(response.text, "lxml")

            for product in soup.find_all("li", class_="product"):
                data = _extract_product(product)
                products_info.append(data)
                print(
                    f"Title: {data['Title']}, Price: {data['Price']}, "
                    f"Image URL: {data['Image URL']}, "
                    f"Product URL: {data['Product URL']}"
                )

    print(f"\nTotal products scraped: {len(products_info)}")
    return products_info


def _max_page_number(soup):
    """Return the highest page number in the pagination list, or 1 if none."""
    pagination = soup.find("ul", class_="page-numbers")
    if pagination is None:
        return 1
    labels = (item.text.strip() for item in pagination.find_all("li"))
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # (e.g. superscript digits) that int() cannot convert.
    numbers = [int(label) for label in labels if label.isdecimal()]
    return max(numbers, default=1)


def _extract_product(product):
    """Build the result dict for a single ``li.product`` element."""
    title_tag = product.find("h2", class_="woocommerce-loop-product__title")
    price_tag = product.find("span", class_="woocommerce-Price-amount")
    image_tag = product.find("img", class_="attachment-woocommerce_thumbnail")
    link_tag = product.find("a", class_="woocommerce-LoopProduct-link")
    return {
        "Title": title_tag.text.strip() if title_tag is not None else "No title",
        "Price": price_tag.text.strip() if price_tag is not None else "No price",
        "Image URL": image_tag.get("src") if image_tag is not None else "No image",
        "Product URL": link_tag.get("href") if link_tag is not None else "No URL",
    }
# Entry point: scrape the shop and export the results to CSV.
if __name__ == "__main__":
    data = scrapeme(baseURL)

    if data:
        # Convert to DataFrame
        df = pd.DataFrame(data)

        # Save CSV. "utf-8-sig" prepends a byte-order mark — presumably so
        # spreadsheet tools detect the encoding of non-ASCII price symbols.
        df.to_csv("products_info.csv", index=False, encoding='utf-8-sig')
        print("\nCSV saved successfully!")
    else:
        # scrapeme() returns [] when the first page could not be fetched (or
        # no products were found). Do not overwrite an existing CSV with an
        # empty file or report success in that case.
        print("\nNo products scraped; CSV not written.")