forked from railbotan/multi-task-at-18
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget-links-Task.py
More file actions
18 lines (15 loc) · 812 Bytes
/
get-links-Task.py
File metadata and controls
18 lines (15 loc) · 812 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
from urllib.request import urlopen
from urllib.parse import unquote
from bs4 import BeautifulSoup
from tqdm import tqdm
# Russian Wikipedia page to sample; the percent-encoded path decodes to
# "Служебная:Случайная_страница" (the "Special:Random page" service page).
url = 'https://ru.wikipedia.org/wiki/%D0%A1%D0%BB%D1%83%D0%B6%D0%B5%D0%B1%D0%BD%D0%B0%D1%8F:%D0%A1%D0%BB%D1%83%D1%87%D0%B0%D0%B9%D0%BD%D0%B0%D1%8F_%D1%81%D1%82%D1%80%D0%B0%D0%BD%D0%B8%D1%86%D0%B0'

# Number of pages to fetch. It was reduced because a larger run took a long
# time.
PAGE_COUNT = 20

# Collect links from PAGE_COUNT fetches of `url` into urls.txt, one link per
# line. Context managers guarantee that the output file is flushed and closed
# and that every HTTP response is released, even if a request or a parse
# fails partway through the run.
with open('urls.txt', 'w', encoding='utf8') as res:
    for _ in tqdm(range(PAGE_COUNT)):
        with urlopen(url) as response:
            html = response.read().decode('utf8')
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Keep only absolute links (href starts with "http") whose text
            # does not contain "wiki"; anchors without an href are skipped.
            if href and href.startswith('http') and 'wiki' not in href:
                print(href, file=res)