-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
111 lines (102 loc) · 3.43 KB
/
config.yaml
File metadata and controls
111 lines (102 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Google Maps Scraper Configuration
# This file contains all configurable settings for the scraper
# Browser Settings
# Every key below is nested under `browser`; the 2-space indentation is
# significant and must be preserved.
browser:
  # Optional path to Chrome/Chromium executable (leave blank to auto-detect)
  executable_path: ''
  # Run browser in headless mode (true/false)
  headless: false
  # Navigation timeout in milliseconds
  timeout_navigation: 60000
  # Element wait timeout in milliseconds
  timeout_element: 10000
  # Short timeout for quick operations
  # (presumably milliseconds, like the timeouts above - TODO confirm)
  timeout_short: 5000
  # Very short timeout for fallback operations
  # (presumably milliseconds as well - TODO confirm)
  timeout_very_short: 3000
  # Handle Google cookie consent banner automatically
  handle_cookie_banner: true
  # Cookie preference: 'reject' or 'accept' (reject is recommended for privacy)
  cookie_preference: 'reject'
  # Enable debug mode with screenshots and HTML saves
  # NOTE(review): this ships enabled; consider `false` for routine runs if the
  # extra screenshots and HTML saves are not wanted.
  enable_debug_mode: true
# Scraping Behavior Settings
# Every key below is nested under `scraping`; the 2-space indentation is
# significant and must be preserved.
scraping:
  # Time to wait between scroll operations (milliseconds)
  scroll_interval: 1500
  # Maximum scroll attempts when no new results found
  max_scroll_attempts: 5
  # Maximum listings to process per grid cell
  max_listings_per_cell: 120
  # Maximum reviews to extract per business
  max_reviews_per_business: 100
  # Number of reviews to process in each batch
  review_batch_size: 10
  # Default number of reviews if no limit specified
  # NOTE(review): how this interacts with max_reviews_per_business is not
  # visible from this file - confirm against the consumer.
  default_max_reviews: 50
  # Default scraping mode: 'fast' (sequential) or 'coverage' (distributed)
  default_mode: 'fast'
# Geographic Grid Settings
# Every key below is nested under `grid`; the 2-space indentation is
# significant and must be preserved.
grid:
  # Default grid size (creates grid_size x grid_size cells, so 2 -> 4 cells)
  default_grid_size: 2
  # Default zoom level for Google Maps
  default_zoom_level: 12
  # Default search bounds [min_lat, min_lng, max_lat, max_lng]
  # Current default is Toronto area - adjust for your region.
  # Kept in flow style because it is a short, atomic list of four numbers.
  default_bounds: [43.6, -79.5, 43.9, -79.2]
# File and Output Settings
# Every key below is nested under `files`; the 2-space indentation is
# significant and must be preserved.
files:
  # Filename for business data CSV
  result_filename: 'result.csv'
  # Filename for reviews CSV
  reviews_filename: 'reviews.csv'
  # Filename for progress tracking
  progress_filename: 'scraper_progress.json'
  # Log message format
  # Must stay quoted: the value starts with '%' (a YAML indicator) and
  # contains ': ', either of which would break a plain scalar.
  # NOTE(review): the %(name)s placeholders look like Python logging
  # attributes - confirm against the consumer.
  log_format: '%(asctime)s [%(levelname)s] %(name)s: %(message)s'
# Owner Enrichment Settings
# Every key below is nested under `owner_enrichment`; the 2-space indentation
# is significant and must be preserved.
owner_enrichment:
  # Enable adaptive crawl + LLM owner extraction
  enabled: false

  # Crawl strategy controls
  max_depth: 2
  max_pages: 4
  # Per-request timeout in milliseconds (per the `_ms` suffix)
  request_timeout_ms: 20000
  # Terms used by the crawl; the list mixes English and German spellings.
  # NOTE(review): exactly how the crawler applies these terms is not visible
  # from this file - confirm against the consumer.
  query_terms:
    - 'impressum'
    - 'imprint'
    - 'owner'
    - 'contact'
    - 'about us'
    - 'ueber uns'
    - 'geschaeftsfuehrer'
  # NOTE(review): both thresholds look like 0-1 scores; their exact meaning
  # is defined by the consumer - confirm before tuning.
  confidence_threshold: 0.75
  saturation_threshold: 0.6
  crawler_engine: 'adaptive'
  # Explicit null: no browser channel is configured here.
  crawler_browser_channel: null

  # OpenRouter LLM defaults
  # Presumably the NAME of the environment variable that holds the API key,
  # not the key itself - never put the secret in this file.
  openrouter_api_key_env: 'OPENROUTER_API_KEY'
  openrouter_default_model: 'google/gemini-2.0-flash-exp:free'
  # Compatibility flag; explicit model choice is always honored.
  allow_free_models_only: true
  max_llm_retries: 2
  llm_response_format: 'json_object'
  log_prompts: false
# Performance and Rate Limiting
# Note: Adjust these values based on your needs and to avoid being rate-limited.
# A longer scroll_interval and lower max_* limits are more respectful to
# Google's servers but slower.
# A shorter scroll_interval and higher max_* limits are faster but may trigger
# anti-bot measures.
# Example configurations for different use cases:
#
# Conservative (slow but safe):
# scraping:
#   scroll_interval: 3000
#   max_scroll_attempts: 3
#   max_listings_per_cell: 50
#   max_reviews_per_business: 25
#
# Aggressive (fast but may get blocked):
# scraping:
#   scroll_interval: 500
#   max_scroll_attempts: 10
#   max_listings_per_cell: 200
#   max_reviews_per_business: 200