-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
120 lines (93 loc) · 3.63 KB
/
scraper.py
File metadata and controls
120 lines (93 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests
import os
import argparse
from dotenv import dotenv_values
from bs4 import BeautifulSoup
from pexels_api import API
from pinscrape import pinscrape
class Scraper:
    """Download images matching a keyword from Unsplash, Pexels and Pinterest.

    All images are written into a flat ``Download/`` directory as
    ``from<Site><keyword><n>.jpg``.
    """

    def __init__(self, key, lmt):
        # Pexels requires an API key; it is read from a local .env file
        # containing a PEXELS_API_KEY=... entry (KeyError if missing).
        env_var = dotenv_values(".env")
        self.pexels_api_key = env_var["PEXELS_API_KEY"]
        self.key = key      # search keyword
        self.limit = lmt    # max number of images to download per site

    @staticmethod
    def get_parser():
        """Build the CLI argument parser.

        Declared ``@staticmethod`` so it works both as ``Scraper.get_parser()``
        (the original call style) and on an instance.
        """
        parser = argparse.ArgumentParser(description="--- Web-image-scraper usage ---")
        parser.add_argument("--kw", type=str, help="Keyword of scrape image")
        parser.add_argument("--lm", type=int, default=1, help="Download limit")
        return parser

    def _save_image(self, site, ind, content):
        """Write raw image bytes to ``Download/from<site><key><ind+1>.jpg``."""
        # exist_ok avoids the check-then-create race of os.path.exists/os.mkdir.
        os.makedirs("Download", exist_ok=True)
        path = f"Download/from{site}{self.key}{ind + 1}.jpg"
        with open(path, "wb") as file:
            file.write(content)

    # working, using beautiful soup to scrape and download images
    def download_unsplash_images(self):
        """Scrape the Unsplash search page for the keyword and save images."""
        site = "Unsplash"
        url = f"https://unsplash.com/s/photos/{self.key}/"
        # timeout prevents the script from hanging indefinitely on a dead host
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            # NOTE: these CSS class names are tied to Unsplash's current
            # markup and will break silently if the site is redesigned.
            images = soup.find_all("img", {"class": "tB6UZ a5VGX"}, limit=self.limit)
            images_links = [img.get("src") for img in images]
            print("Downloading Unsplash images...")
            for ind, link in enumerate(images_links):
                img = requests.get(link, timeout=30)
                self._save_image(site, ind, img.content)
            print("Complete")
        else:
            print("responding error")

    # working, using official api key and pexels_api module
    def download_pexels_images(self, pages=1, img_per_page=1):
        """Fetch images via the official Pexels API and save the originals."""
        site = "Pexels"
        api = API(self.pexels_api_key)
        api.search(self.key, img_per_page, pages)
        images_links = api.get_entries()
        print("Downloading Pexels images...")
        for ind, link in enumerate(images_links):
            # link.original is the full-resolution URL of each entry
            img = requests.get(link.original, timeout=30)
            self._save_image(site, ind, img.content)
        print("Complete")

    # working, using pinscrape to download images
    def download_pinterest_images(self):
        """Delegate Pinterest scraping to the pinscrape package."""
        print("Downloading Pinterest images...")
        pinscrape.scraper.scrape(
            key=self.key,
            output_folder="Download/",
            max_images=self.limit,
        )
        print("Complete")
def main():
    """CLI entry point: resolve keyword and limit, then scrape all three sites.

    Falls back to interactive prompts for any value not supplied on the
    command line, and prints (rather than propagates) ValueError failures.
    """
    try:
        args = Scraper.get_parser().parse_args()
        key = args.kw if args.kw else input("Which type of images you want to download: ")
        # NOTE(review): --lm defaults to 1, so args.lm is always truthy and
        # the interactive prompt below is dead code unless that default is
        # removed from the parser. Kept for behavioral compatibility.
        if args.lm:
            download_limit = args.lm
        else:
            download_limit = int(input("How many images you want to download: "))
        # Guard clause: reject non-positive limits before doing any network work.
        if download_limit <= 0:
            raise ValueError("Download limit error")
        scraper = Scraper(key, download_limit)
        scraper.download_unsplash_images()
        scraper.download_pexels_images()
        scraper.download_pinterest_images()
    except ValueError as e:
        # Also catches int() conversion failures from the interactive prompt.
        print(f"An error occurred: {e}. Please enter a valid number.")


if __name__ == "__main__":
    main()