Skip to content

Commit 9c030fb

Browse files
committed
templates
1 parent 4d3b3c6 commit 9c030fb

1 file changed

Lines changed: 152 additions & 3 deletions

File tree

templates/apnnews.go

Lines changed: 152 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,167 @@
11
package templates
22

33
import (
4+
"encoding/json"
5+
"log"
6+
7+
//"log"
8+
"strings"
9+
"time"
410

511
"github.com/PuerkitoBio/goquery"
612
)
713

14+
// ApNewsMetaData is the decoded form of a JSON-LD payload whose top-level
// value is an array of article objects. It is the target type used when
// unmarshalling the text of <script type="application/ld+json"> elements.
//
// Field names follow the JSON keys given in the struct tags. Both date fields
// are time.Time, so encoding/json requires them to be RFC 3339 strings; any
// other format makes the whole Unmarshal call fail.
type ApNewsMetaData []struct {
	Context       string    `json:"@context"`
	Type          string    `json:"@type"`
	URL           string    `json:"url"`
	DateModified  time.Time `json:"dateModified"`
	DatePublished time.Time `json:"datePublished"`
	Description   string    `json:"description"`
	// Image lists the images attached to the article.
	Image []struct {
		Context      string `json:"@context"`
		Type         string `json:"@type"`
		Height       int    `json:"height"`
		ThumbnailURL string `json:"thumbnailUrl"`
		URL          string `json:"url"`
		Width        int    `json:"width"`
	} `json:"image"`
	MainEntityOfPage struct {
		Type string `json:"@type"`
		ID   string `json:"@id"`
	} `json:"mainEntityOfPage"`
	// Author must be a JSON array; a single author object does not decode
	// into this field.
	Author []struct {
		Context string `json:"@context"`
		Type    string `json:"@type"`
		Name    string `json:"name"`
	} `json:"author"`
	Publisher struct {
		Type string `json:"@type"`
		Name string `json:"name"`
		Logo struct {
			Type string `json:"@type"`
			URL  string `json:"url"`
		} `json:"logo"`
	} `json:"publisher"`
	ArticleSection []string `json:"articleSection"`
	Keywords       []string `json:"keywords"`
	ThumbnailURL   string   `json:"thumbnailUrl"`
	Name           string   `json:"name"`
	Headline       string   `json:"headline"`
}
52+
53+
func (t *Template) ApNewsCommonGetPublishedAtTimestamp(document *goquery.Document) int64 {
54+
var publishedAt int64 = 0
55+
publishedAt = t.CommonGetPublishedAtTimestampSingleJson(document)
56+
if publishedAt == 0 {
57+
publishedAt = t.ApNewsPublishedAtTimeFromScriptMetadata(document)
58+
}
59+
return publishedAt
60+
61+
}
62+
63+
func (t *Template) ApNewsPublishedAtTimeFromScriptMetadata(document *goquery.Document) int64 {
64+
var publishedAt int64 = 0
65+
scriptSelectorFirst := "head > script[type=\"application/ld+json\"]"
66+
scriptSelectorSecond := "body > script[type=\"application/ld+json\"]"
67+
scriptSelectorThird := "script[type=\"application/ld+json\"]"
68+
69+
scriptSelectorList := make([]string, 100)
70+
scriptSelectorList = append(scriptSelectorList, scriptSelectorFirst)
71+
scriptSelectorList = append(scriptSelectorList, scriptSelectorSecond)
72+
scriptSelectorList = append(scriptSelectorList, scriptSelectorThird)
73+
74+
for _, scriptSelector := range scriptSelectorList {
75+
76+
document.Find(scriptSelector).Each(func(i int, s *goquery.Selection) {
77+
if publishedAt != 0 {
78+
return
79+
}
80+
scriptContent := strings.TrimSpace(s.Text())
81+
var firstTypeMetaData ApNewsMetaData
82+
unmarshalErr := json.Unmarshal([]byte(scriptContent), &firstTypeMetaData)
83+
if unmarshalErr != nil {
84+
log.Printf("convert ApNewsMetaData unmarshalError %v", unmarshalErr)
85+
return
86+
87+
}
88+
firstElement := firstTypeMetaData[0]
89+
publishedAt = firstElement.DateModified.Unix()
90+
91+
// var jsonMap map[string]interface{}
92+
93+
// logger.Info("script content %s author length %d",scriptContent, len(currentDWMetadata.Author))
94+
})
95+
if publishedAt != 0 {
96+
break
97+
}
98+
}
99+
return publishedAt
100+
101+
}
102+
103+
func (t *Template) ApNewsAuthorExtractFromScriptMetadata(document *goquery.Document) (string, string) {
104+
105+
author := ""
106+
published_at := ""
107+
scriptSelectorFirst := "head > script[type=\"application/ld+json\"]"
108+
scriptSelectorSecond := "body > script[type=\"application/ld+json\"]"
109+
scriptSelectorThird := "script[type=\"application/ld+json\"]"
110+
111+
scriptSelectorList := make([]string, 100)
112+
scriptSelectorList = append(scriptSelectorList, scriptSelectorFirst)
113+
scriptSelectorList = append(scriptSelectorList, scriptSelectorSecond)
114+
scriptSelectorList = append(scriptSelectorList, scriptSelectorThird)
115+
116+
for _, scriptSelector := range scriptSelectorList {
117+
118+
document.Find(scriptSelector).Each(func(i int, s *goquery.Selection) {
119+
if author != "" {
120+
return
121+
}
122+
scriptContent := strings.TrimSpace(s.Text())
123+
var firstTypeMetaData ApNewsMetaData
124+
unmarshalErr := json.Unmarshal([]byte(scriptContent), &firstTypeMetaData)
125+
if unmarshalErr != nil {
126+
log.Printf("convert ApNewsMetaData unmarshalError %v", unmarshalErr)
127+
return
128+
129+
}
130+
firstElement := firstTypeMetaData[0]
131+
for currentIndex, currentAuthor := range firstElement.Author {
132+
133+
if currentIndex != 0 {
134+
author += " & "
135+
}
136+
log.Printf("author Name: ", currentAuthor.Name)
137+
author += currentAuthor.Name
138+
}
139+
140+
// var jsonMap map[string]interface{}
141+
142+
// logger.Info("script content %s author length %d",scriptContent, len(currentDWMetadata.Author))
143+
})
144+
if author != "" {
145+
break
146+
}
147+
}
148+
log.Printf("author last: %s", author)
149+
return author, published_at
150+
}
151+
8152
func (t *Template) ApnNewsScrapMetaData(document *goquery.Document) (string, string) {
9153
author := ""
10154
published_at := ""
11-
author,published_at= t.AuthorExtractFromScriptMetadata(document)
155+
author, published_at = t.AuthorExtractFromScriptMetadata(document)
156+
if author == "" {
157+
author, published_at = t.AuthorExtractFromScriptMetadata(document)
158+
}
159+
if author == "" {
160+
author, published_at = t.ApNewsAuthorExtractFromScriptMetadata(document)
161+
}
12162
if author == "" {
13163
author = "AP News"
14-
}
164+
}
15165

16166
return author, published_at
17167
}
18-

0 commit comments

Comments
 (0)