-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdcard(舊).py
More file actions
118 lines (96 loc) · 3.53 KB
/
dcard(舊).py
File metadata and controls
118 lines (96 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import random
import re
from bs4 import BeautifulSoup as bs
import requests as req
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager # 自動更新webdriver pip install webdriver-manager
import time
from fake_useragent import UserAgent
# search: the restaurant brand to look up; year: crawl every post from now back to this year
def getDcardInfo(search, year, max_idle_scrolls=20):
    """Collect Dcard food-forum search results for *search* back to *year*.

    Opens the search page in Chrome, scrolls down step by step and records
    the title, date text and link of every result card. Scrolling stops as
    soon as a card's date text contains a four-digit year that is less than
    or equal to *year*.

    Args:
        search: Query string inserted into the search URL.
        year: Cut-off year, as an int or a numeric string.
        max_idle_scrolls: Give up after this many consecutive scrolls that
            reveal no new post, so the loop cannot spin forever when the
            result list ends before the cut-off year is reached.

    Returns:
        A list of dicts with the keys "title", "date" and "link", in the
        order the posts were first seen (each link appears once). The post
        that triggered the cut-off is not included.

    Raises:
        ValueError: If *year* cannot be converted to an int.
    """
    # Validate before the browser starts so a bad argument leaks no driver.
    cutoff_year = int(year)
    year_pattern = re.compile(r"\d{4}")

    options = Options()
    # NOTE(review): the driver path is passed positionally, which is the
    # older Selenium call style - confirm against the installed version.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

    dataList = []
    seen_links = set()
    reached_cutoff = False
    idle_scrolls = 0
    try:
        driver.get("https://www.dcard.tw/search?forum=food&query={}".format(search))
        while not reached_cutoff and idle_scrolls < max_idle_scrolls:
            # Scroll down, then wait so the page can react before the
            # snapshot is taken.
            driver.execute_script("window.scrollBy(0,1000)")
            time.sleep(random.randint(2, 4))
            soup = bs(driver.page_source, "lxml")

            found_new = False
            # Each <article> is one result card.
            for item in soup.find_all("article", class_="sc-fc3be524-0 gFlPCQ"):
                anchor = item.find("a", class_="sc-fc3be524-3 fpKIeR")
                date_tags = item.find_all("div", class_="sc-1b1fbd22-3 gqlsdQ")
                if anchor is None or not date_tags:
                    # Card without the expected markup: nothing to record.
                    continue
                href = anchor.get("href")
                if not href:
                    continue
                link = "https://www.dcard.tw" + href
                if link in seen_links:
                    # Already recorded (and date-checked) on an earlier scroll.
                    continue
                seen_links.add(link)
                found_new = True

                date = date_tags[-1].text
                # The anchor's children are joined into one title string.
                title = "".join(child.text for child in anchor)
                dataList.append({"title": title, "date": date, "link": link})

                # Stop once a post carries a year at or before the cut-off.
                years = year_pattern.findall(date)
                if years and int(years[0]) <= cutoff_year:
                    reached_cutoff = True
                    break

            idle_scrolls = 0 if found_new else idle_scrolls + 1
    finally:
        # Always release the browser, even when parsing fails midway.
        driver.quit()

    # The last entry is the post that hit the cut-off year; drop it. When the
    # list simply ran out, every collected post is kept.
    return dataList[:-1] if reached_cutoff else dataList
def clearData(dataList, path="dcard(時時香).json"):
    """Drop posts whose title was already seen and save the rest as JSON.

    The first post for each title is kept and the input order is preserved.
    The result is written to *path* as UTF-8 JSON with non-ASCII characters
    left unescaped.

    Args:
        dataList: Iterable of dicts that each have a "title" key.
        path: Output file name. The default is the name used by the original
            hard-coded implementation.
    """
    seen_titles = set()  # set membership keeps the scan linear
    ClearData = []
    for item in dataList:
        title = item["title"]
        if title in seen_titles:
            continue
        seen_titles.add(title)
        ClearData.append(item)

    with open(path, "w", encoding="utf-8") as f:
        json.dump(ClearData, f, ensure_ascii=False, indent=1)
# Step 1: collect the search results for the brand, back to the given year,
# and store the de-duplicated list on disk.
datalist = getDcardInfo("時時香", '2020')
clearData(datalist)

# Step 2: read the stored list back and gather the link of every post.
with open("dcard(時時香).json", "r", encoding="utf-8") as file:
    datas = json.load(file)
link_list = [entry["link"] for entry in datas]

# Step 3: fetch each post and extract its title and body text.
ua = UserAgent()
content_list = []
for link in link_list:
    try:
        # Send a browser-like User-Agent and never wait forever on one post.
        res = req.get(link, headers={"User-Agent": ua.random}, timeout=30)
    except req.RequestException as exc:
        print("skip {}: request failed ({})".format(link, exc))
        time.sleep(random.uniform(2, 6))
        continue
    soup = bs(res.text, "lxml")
    time.sleep(random.uniform(2, 6))
    title_tag = soup.find("h1", class_="sc-ae7e8d73-0 wYxxj")
    # Long pause between posts to keep the request rate low.
    time.sleep(random.randint(40, 60))
    if title_tag is None:
        # Page without the expected headline: skip it rather than abort
        # the whole run.
        print("skip {}: title element not found".format(link))
        continue

    # The body is the text of every <span> inside the first content
    # container; a post without that container gets an empty body.
    containers = soup.find_all("div", class_="sc-ebb1bedf-0 aiaXw")
    spans = containers[0].find_all("span") if containers else []
    textlist = [span.text.strip().replace("\n", '') for span in spans]
    content_list.append({"title": title_tag.text, "content": "".join(textlist)})

# The enclosing "[" and "]" are stripped, so the file holds a comma-separated
# run of objects rather than a complete JSON document.
# NOTE(review): presumably so several runs can be pasted together by hand -
# confirm before changing.
with open("dcard-content(時時香).json", "w", encoding="utf-8") as f:
    f.write(json.dumps(content_list, ensure_ascii=False, indent=1)[1:-1])