-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
186 lines (144 loc) · 6.14 KB
/
Copy pathscraper.py
File metadata and controls
186 lines (144 loc) · 6.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Costco Korea product scraper.
Usage: python scraper.py <url> [--output products.json]
"""
import asyncio
import argparse
import re
import sys
from datetime import date
from urllib.parse import urlparse
from playwright.async_api import async_playwright, Page, TimeoutError as PWTimeout
from models import Product
from parsers import parse_name_quantity, parse_price, parse_rating_aria
from storage import save_json, save_csv, save_sql, sql_path_from
# Site origin used as the default base when to_absolute() resolves relative URLs.
BASE_URL = "https://www.costco.co.kr"
def category_from_url(url: str) -> str:
    """Build a category label from the path of a category URL.

    The path segments that precede the first literal ``c`` segment are joined
    with underscores. When the path has no ``c`` segment, every non-empty
    segment is used. An empty result falls back to ``"products"``.
    """
    segments = urlparse(url).path.rstrip("/").split("/")

    if "c" in segments:
        # Skip the leading element (empty for paths that start with "/") and
        # stop right before the "c" marker.
        selected = segments[1:segments.index("c")]
    else:
        selected = segments

    non_empty = list(filter(None, selected))
    if not non_empty:
        return "products"
    return "_".join(non_empty)
def default_output_path(url: str, ext: str = "json") -> str:
    """Return ``products/<YYYY-MM-DD>/<category>.<ext>`` for *url*.

    The dated folder is created if it does not exist yet, so the returned
    path can be written to directly.
    """
    import os

    file_name = f"{category_from_url(url)}.{ext}"
    dated_dir = os.path.join("products", date.today().strftime("%Y-%m-%d"))
    os.makedirs(dated_dir, exist_ok=True)
    return os.path.join(dated_dir, file_name)
def to_absolute(url: str | None, base: str = BASE_URL) -> str | None:
    """Resolve a possibly-relative URL against *base*.

    Args:
        url: An absolute ``http(s)`` URL, a protocol-relative URL
            (``//host/path``), a site-relative path (``/path`` or ``path``),
            or an empty value.
        base: Origin that relative paths are attached to.

    Returns:
        The absolute URL, or ``None`` when *url* is ``None``, empty or
        whitespace-only.
    """
    if not url:
        return None
    url = url.strip()
    if not url:
        return None

    if url.startswith(("http://", "https://")):
        return url

    if url.startswith("//"):
        # Protocol-relative reference: only the scheme is missing, so borrow
        # it from the base instead of gluing the whole origin in front.
        scheme = urlparse(base).scheme or "https"
        return f"{scheme}:{url}"

    # Join with exactly one slash so both "/path" and "path" resolve correctly
    # and a trailing slash on the base does not produce "//".
    return f"{base.rstrip('/')}/{url.lstrip('/')}"
async def scroll_to_load_all(page: Page):
    """Scroll to the bottom of *page* until its height stops changing.

    Gives up after 20 rounds. Each round scrolls to the current bottom and
    then waits 1200 ms before the height is measured again.
    """
    last_seen = 0
    rounds_left = 20
    while rounds_left > 0:
        rounds_left -= 1
        current = await page.evaluate("document.body.scrollHeight")
        if current == last_seen:
            # Height unchanged since the previous round: nothing new appeared.
            return
        last_seen = current
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(1200)
async def scrape_page(page: Page) -> list[Product]:
    """Extract the products listed on the currently loaded page.

    Calls scroll_to_load_all() first, then reads name, price, rating, image
    and link from every product container found on the page.

    Args:
        page: Playwright page that has already navigated to a listing URL.

    Returns:
        One Product per container with a non-empty name. Containers without
        a name element, or with blank name text, are skipped.
    """
    await scroll_to_load_all(page)

    containers = await page.query_selector_all("li.product-list-item")
    if not containers:
        # Looser fallback selector for when the primary class matches nothing.
        containers = await page.query_selector_all("li[class*='product']")
    print(f" 상품 컨테이너: {len(containers)}개")

    products = []
    for container in containers:
        raw_name = await _read_product_name(container)
        if not raw_name:
            continue
        name, quantity = parse_name_quantity(raw_name)

        price_el = await container.query_selector(".original-price .product-price-amount")
        price = parse_price((await price_el.inner_text()).strip()) if price_el else None

        rating_el = await container.query_selector(".star-ratings-css[aria-label]")
        aria_label = await rating_el.get_attribute("aria-label") if rating_el else None
        rating, review_count = parse_rating_aria(aria_label)

        img_url = await _read_image_url(container)

        link_el = await container.query_selector("a.lister-name")
        href = await link_el.get_attribute("href") if link_el else None

        products.append(Product(
            name=name,
            quantity=quantity,
            price=price,
            rating=rating,
            review_count=review_count,
            image_url=img_url,
            product_url=to_absolute(href) or "",
        ))
    return products


async def _read_product_name(container) -> str | None:
    """Return the stripped name text of a product container, or None if absent."""
    name_el = await container.query_selector("a.lister-name .notranslate")
    if not name_el:
        name_el = await container.query_selector("a.lister-name")
    if not name_el:
        return None
    return (await name_el.inner_text()).strip()


def _first_srcset_url(srcset: str | None) -> str | None:
    """Return the URL of the first image candidate in a ``srcset`` value.

    A ``srcset`` may list several candidates, each optionally followed by a
    width or density descriptor (``"a.webp 1x, b.webp 2x"``). Only the first
    URL is usable as a plain image address.
    """
    if not srcset:
        return None
    tokens = srcset.split()
    if not tokens:
        return None
    # A descriptor-less first candidate is followed directly by the comma.
    return tokens[0].rstrip(",") or None


async def _read_image_url(container) -> str | None:
    """Return the absolute image URL, preferring the WebP source over <img>."""
    webp_el = await container.query_selector("picture source[type='image/webp']")
    if webp_el:
        img_url = to_absolute(_first_srcset_url(await webp_el.get_attribute("srcset")))
        if img_url:
            return img_url

    img_el = await container.query_selector("picture img")
    if img_el:
        return to_absolute(await img_el.get_attribute("src"))
    return None
def _page_url(base_url: str, page_num: int) -> str:
    """Return the URL for the 1-based *page_num* of a listing.

    Page 1 is *base_url* unchanged. Later pages carry a zero-based ``page``
    query parameter (page 2 -> ``page=1``). Any existing ``page=<digits>``
    parameter is replaced, other query parameters keep their order, and a
    ``#fragment`` stays at the end of the URL.

    Args:
        base_url: Listing URL, optionally with a query string and fragment.
        page_num: 1-based page number.

    Returns:
        The URL to request for that page.
    """
    if page_num == 1:
        return base_url

    # Work on the query only: the fragment must stay last, otherwise the page
    # parameter would end up inside it and never reach the server.
    without_fragment, hash_sep, fragment = base_url.partition("#")
    path, _, query = without_fragment.partition("?")

    # Filtering whole parameters (rather than deleting "?page=N" textually)
    # keeps the "?" separator intact when "page" is the first parameter.
    params = [
        param
        for param in query.split("&")
        if param and not re.fullmatch(r"page=\d+", param)
    ]
    params.append(f"page={page_num - 1}")

    return f"{path}?{'&'.join(params)}{hash_sep}{fragment}"
async def scrape_url(url: str) -> list[Product]:
    """Scrape every page of a category listing, starting at *url*.

    Launches headless Chromium, walks the pages produced by _page_url() and
    collects the products returned by scrape_page() for each one.

    Pagination stops when a page yields no products, or when a page yields
    only products that were already collected. The second condition guards
    against sites that serve a repeated page for out-of-range page numbers,
    which would otherwise keep the loop running forever.

    Args:
        url: Category listing URL (used as-is for the first page).

    Returns:
        All products collected, in page order.
    """
    print(f"스크래핑: {url}")
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/124.0.0.0 Safari/537.36"
                ),
                locale="ko-KR",
            )
            page = await context.new_page()

            all_products: list[Product] = []
            seen_keys: set[tuple] = set()
            page_num = 1
            while True:
                page_url = _page_url(url, page_num)
                print(f" 페이지 {page_num}: {page_url}")
                try:
                    await page.goto(page_url, wait_until="networkidle", timeout=30000)
                except PWTimeout:
                    # Best effort: scrape whatever has rendered so far.
                    print(" networkidle 타임아웃 — 현재 상태로 진행")

                products = await scrape_page(page)
                if not products:
                    print(" 상품 없음 — 페이지네이션 종료")
                    break

                page_keys = {(p.product_url, p.name) for p in products}
                if page_keys <= seen_keys:
                    # Nothing new on this page: the site is repeating itself.
                    print(" 새 상품 없음 — 페이지네이션 종료")
                    break
                seen_keys |= page_keys

                all_products.extend(products)
                print(f" {len(products)}개 수집 (누적: {len(all_products)})")
                page_num += 1
        finally:
            # Close the browser even when navigation or parsing raises.
            await browser.close()
    return all_products
def main():
    """Command-line entry point: scrape a category URL and save the results.

    Exits with status 1 when no products were found. Otherwise writes CSV
    (for an output path ending in ``.csv``) or JSON, writes the SQL file at
    the path given by sql_path_from(), and prints the first five products.
    """
    cli = argparse.ArgumentParser(description="Costco Korea 상품 스크래퍼")
    cli.add_argument("url", help="Costco Korea 카테고리 URL")
    cli.add_argument("--output", "-o", default=None, help="출력 경로 (.json 또는 .csv)")
    options = cli.parse_args()

    products = asyncio.run(scrape_url(options.url))
    if not products:
        print("상품을 찾을 수 없습니다. 페이지 구조가 변경됐을 수 있습니다.")
        sys.exit(1)

    print(f"\n총 {len(products)}개")

    output_path = options.output
    if not output_path:
        output_path = default_output_path(options.url, "json")

    writer = save_csv if output_path.endswith(".csv") else save_json
    writer(products, output_path)
    # NOTE(review): the reviewed source had lost its indentation; the SQL
    # export is assumed to run for both output formats - confirm.
    save_sql(products, sql_path_from(output_path))

    print("\n--- 샘플 (상위 5개) ---")
    for item in products[:5]:
        if item.rating:
            rating_str = f"{item.rating} ({item.review_count}건)"
        else:
            rating_str = "평점없음"
        if item.price:
            price_str = f"{item.price:,}"
        else:
            price_str = "N/A"
        print(f" {item.name[:35]:<35} qty={item.quantity} {price_str:<10} {rating_str}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()