forked from jeffrey-yc/Homework
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
73 lines (54 loc) · 1.9 KB
/
main.py
File metadata and controls
73 lines (54 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#-------------------------------------------
# 匯入必要模組
#-------------------------------------------
from selenium import webdriver
from html.parser import HTMLParser
#-------------------------------------------
# 定義一個HTML解譯類別
#-------------------------------------------
class MyHTMLParser(HTMLParser):
    """Collect page text that lies between fixed start and stop markers.

    Text chunks are inspected one at a time. A chunk whose stripped text
    equals one of the start markers switches capturing on; a chunk that
    contains one of the stop markers switches it off. While capturing is
    on, every chunk (including the start-marker chunk itself, but not the
    stop-marker chunk) is appended verbatim to ``content``.
    """

    # Text captured so far.
    content = ''
    # Capture flag: True while incoming text chunks are being recorded.
    print = False

    def handle_data(self, data):
        """Update the capture flag for this text chunk and record it if on.

        Args:
            data: The raw text chunk passed in by the parser.
        """
        text = data.strip()
        if text in ('驚奇4超人', '劇情介紹'):
            # A start marker must match the whole stripped chunk.
            self.print = True
        elif '期待度' in text or '展開劇情簡介' in text:
            # A stop marker only needs to appear somewhere in the chunk.
            self.print = False

        if self.print:
            # Append the unstripped chunk so original whitespace is kept.
            self.content = self.content + data

    def get_content(self):
        """Return all text captured so far."""
        return self.content
#-------------------------------------------
# 載入Chrome驅動程式
#-------------------------------------------
# NOTE(review): passing the driver path positionally is the older Selenium
# calling style; newer Selenium releases expect a Service object instead.
# Confirm against the installed Selenium version before changing this call.
driver = webdriver.Chrome("chromedriver.exe")
#-------------------------------------------
# URLs to visit
#-------------------------------------------
urls = [
    'https://movies.yahoo.com.tw/movieinfo_main.html/id=5644',
]
#-------------------------------------------
# Hand each URL to the browser in turn
#-------------------------------------------
try:
    # Open the output file once, outside the loop: reopening it in 'w' mode
    # for every URL would truncate it each time and keep only the last page.
    with open('out.txt', 'w', encoding='utf-8') as outfile:
        for url in urls:
            driver.get(url)
            # Fetch the rendered page source from the browser.
            pageSource = driver.page_source
            #-------------------------------------------
            # Extract the tag-free text of interest
            #-------------------------------------------
            parser = MyHTMLParser()
            parser.feed(pageSource)
            content = parser.get_content()
            print(content)
            outfile.write(content)
finally:
    #-------------------------------------------
    # Shut down the Chrome driver
    #-------------------------------------------
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window; running it in `finally` releases the browser even
    # when loading, parsing, or writing fails.
    driver.quit()