-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbs_reader.py
More file actions
48 lines (30 loc) · 1.01 KB
/
bs_reader.py
File metadata and controls
48 lines (30 loc) · 1.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import requests
import re
from bs4 import BeautifulSoup
def cup_of_soup(rss, addr, *, timeout=10):
    """Fetch *addr* through *rss* and return the parsed document.

    Args:
        rss: Object exposing a requests-style ``get(url, timeout=...)``
            method (the script passes a ``requests`` session).
        addr: URL of the page to download.
        timeout: Seconds passed to ``rss.get`` as its ``timeout`` argument.
            Without a timeout a stalled server would block the script
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    The HTTP status code is printed but not checked, so an error page is
    parsed just like a successful response.
    """
    result = rss.get(addr, timeout=timeout)
    print("Connection state:", result.status_code)
    return BeautifulSoup(result.text, 'lxml')
# Demo script: download two practice pages and print selected elements.
webaddr = "https://mofanpy.com/static/scraping/list.html"
webaddr_regex = "https://mofanpy.com/static/scraping/table.html"

# Use the session as a context manager so its pooled connections are
# released when the script finishes, even if a request or lookup raises.
with requests.sessions.Session() as s:
    # NOTE(review): requests sessions do not appear to read a `keep_alive`
    # attribute, so this assignment most likely has no effect on connection
    # reuse. Kept unchanged; confirm the intent before replacing it.
    s.keep_alive = False

    soup = cup_of_soup(s, webaddr)

    # Print the href attribute of every anchor (use anchor.string for the
    # link text instead). href=True skips anchors that have no href, which
    # would otherwise raise KeyError on the subscript below.
    for anchor in soup.find_all('a', href=True):
        print(anchor['href'])

    # Print the text of every element carrying the "month" class.
    for obj_month in soup.find_all(class_='month'):
        print(obj_month.string)

    # Print the list items under the element with class "jan".
    # find() returns None when nothing matches, so guard before descending.
    jan_block = soup.find(class_='jan')
    if jan_block is not None:
        for jandate in jan_block.find_all('li'):
            print(jandate.string)

    soup = cup_of_soup(s, webaddr_regex)

    # Find every <img> whose src matches the ".jpg" pattern. The pattern is
    # a raw string so that "\." is not treated as an (invalid) string escape.
    jpgs = soup.find_all('img', src=re.compile(r'.*\.jpg'))
    print(jpgs)

    # Find elements whose href matches '/.*'.
    # NOTE(review): the original comment said this selects links under
    # https://morvan, but the pattern names no host; as written it matches
    # any href that contains a '/'.
    hrefs = soup.find_all(href=re.compile('/.*'))
    for i in hrefs:
        print(i['href'])