Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*.dll
*.so
*.dylib
enscan

.idea
# Test binary, built with `go test -c`
Expand Down
89 changes: 89 additions & 0 deletions common/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,92 @@ func ExtractPortString(rawURL string) (string, error) {

return port, nil // port 已经是字符串
}

// DeduplicateMapList 对map列表进行去重
// dataType: 数据类型,用于确定去重的关键字段
// data: 待去重的数据列表
func DeduplicateMapList(dataType string, data []map[string]string) []map[string]string {
if len(data) == 0 {
return data
}

// 定义不同数据类型的唯一标识字段组合
var keyFields []string
switch dataType {
case "enterprise_info", "invest", "branch", "holds", "supplier", "partner":
// 企业相关数据使用 PID 作为唯一标识
keyFields = []string{"pid"}
case "icp":
// ICP备案使用域名+备案号作为唯一标识
keyFields = []string{"domain", "icp"}
case "app":
// APP使用名称+Bundle ID作为唯一标识
keyFields = []string{"name", "bundle_id"}
case "wx_app", "wechat":
// 微信小程序和公众号使用名称作为唯一标识
keyFields = []string{"name"}
case "weibo":
// 微博使用链接作为唯一标识
keyFields = []string{"profile_url"}
case "job":
// 招聘信息使用职位名称+发布日期+地点作为唯一标识
keyFields = []string{"name", "publish_time", "location"}
case "copyright":
// 软件著作权使用登记号作为唯一标识
keyFields = []string{"reg_num"}
default:
// 默认使用名称作为唯一标识
keyFields = []string{"name"}
}

// 使用map记录已经出现过的记录
seen := make(map[string]bool)
result := make([]map[string]string, 0, len(data))

for _, item := range data {
// 生成唯一键
var keyParts []string
allEmpty := true
for _, field := range keyFields {
value := item[field]
keyParts = append(keyParts, value)
if value != "" {
allEmpty = false
}
}

// 如果所有关键字段都为空,跳过该记录
if allEmpty {
continue
}

key := strings.Join(keyParts, "|")
if !seen[key] {
seen[key] = true
result = append(result, item)
}
}

return result
}

// DeduplicateData 对整个数据集进行去重
func DeduplicateData(data map[string][]map[string]string) map[string][]map[string]string {
result := make(map[string][]map[string]string)
totalOriginal := 0
totalDeduplicated := 0

for dataType, items := range data {
originalCount := len(items)
totalOriginal += originalCount
result[dataType] = DeduplicateMapList(dataType, items)
totalDeduplicated += len(result[dataType])
}

totalRemoved := totalOriginal - totalDeduplicated
if totalRemoved > 0 {
gologger.Info().Msgf("数据去重完成: 原始 %d 条,去重后 %d 条,移除重复 %d 条", totalOriginal, totalDeduplicated, totalRemoved)
}

return result
}
Loading