-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathuseragents.dat
More file actions
296 lines (283 loc) · 15.3 KB
/
useragents.dat
File metadata and controls
296 lines (283 loc) · 15.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# First word in line must be include or exclude. Remainder of line, after whitespace, is a regex.
# Regex must match the entire user agent string.
# Lines starting with # (optionally after whitespace) are ignored. Blank lines are ignored.
# Include by default.
# Example:
# exclude AhrefsBot/.*
#$ over 100,000 hits/day
## Most hits.
exclude .* MJ12bot/.*; http://mj12bot\.com/\)
exclude .*[ ;]PetalBot;.*
## Generic rule for anything identifying itself as a crawler. Avoid adding anything else that matches this.
exclude .*[Cc]rawler.*
exclude Mozilla/5\.0 \(compatible; SEOkicks; \+https://www\.seokicks\.de/robot\.html\)
## This one appears to be a bit indecisive, using the undecorated string for a few hits and
## then some variants for 100,000 hits.
exclude .*AspiegelBot.*
exclude Aspiegel[bB]ot
#$ 100,000 hits/day
## Chinese bots. The next four appear to work in concert.
exclude .* LieBaoFast/.*
exclude .* Mb2345Browser/.*
## These hide behind valid Chinese software (WeChat, UCBrowser), so block the specific UA for now
exclude Mozilla/5\.0 \(Linux; Android 7\.0; FRD-AL00 Build/HUAWEIFRD-AL00; wv\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Version/4\.0 Chrome/53\.0\.2785\.49 Mobile MQQBrowser/6\.2 TBS/043602 Safari/537\.36 MicroMessenger/6\.5\.16\.1120 NetType/WIFI Language/zh_CN
exclude Mozilla/5\.0\(Linux;U;Android 5\.1\.1;zh-CN;OPPO A33 Build/LMY47V\) AppleWebKit/537\.36\(KHTML,like Gecko\) Version/4\.0 Chrome/40\.0\.2214\.89 UCBrowser/11\.7\.0\.953 Mobile Safari/537\.36
#$ 50,000 hits/day
exclude .* \(Amazonbot/.*; \+https://developer\.amazon\.com/support/amazonbot\)
## Distributed crawler hiding behind a randomized combination of old user agent pieces. Probing
## for vulnerabilities.
exclude Mozilla/5\.0 \(Macintosh; Intel Mac OS X 10\.(6|7|8|9|10); rv:(37|38|43|68)\.0\) Gecko/20100101 Firefox/(37|38|43|68)\.0
exclude Mozilla/5\.0 \(Windows NT (6\.0|6\.1|6\.2|6\.3|10\.0); (WOW64; |Win64; x64; |)rv:(28|37|38|43|50|51|56|68|76)\.0\) Gecko/20100101 Firefox/(28|37|38|43|50|51|56|68|76)\.0
exclude Mozilla/5\.0 \(Windows NT 6\.1; WOW64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/34\.0\.1847\.131 Safari/537\.36
exclude Mozilla/5\.0 \(Windows; U; Windows ?NT (5|6)\.1; en-(US|GB); rv:(1\.8\.1\.6|1\.9\.2|1\.9\.2\.12)\) Gecko/(20100115|20070725|20101026) Firefox/(2\.0\.0\.6|3\.6|3\.6\.12)
exclude Mozilla/5\.0 \(Windows NT 6\.[01]; Win64; x64; rv:(50|73)\.0\) Gecko/20100101 Firefox/(50|73)\.0
exclude Mozilla/5\.0 \(Windows NT 6\.(1|3); Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/(68\.0\.3440\.7|73\.0\.3683\.86) Safari/537\.36
## 2020-03-20
exclude Molokai/0\.2 \(\+http://www\.example\.com/bot\.html\)
## 2021-12
exclude MolokaiBot \(molokaibot@gmail.com\)
## 2020-05
exclude Mozilla/4\.0 \(compatible; MSIE 6\.0; Windows NT 5\.1; SV1; \.NET CLR 2\.0\.50727; \.NET CLR 1\.1\.4322\)
## 2020-08
exclude Pcore-HTTP/.*
## 2021-06
exclude .* BoldBrains SC/.*
## 2022-12
exclude Timpibot/.* \(\+http://www\.timpi\.io\)
#$ 25,000 hits/day
## This appears to be the most recent variant on their UA.
exclude CCBot/.* \(https://commoncrawl\.org/faq/\)
exclude Mozilla/5\.0 \(compatible; YandexBot/.*; \+http://yandex\.com/bots\).*
exclude PiplBot.*\(\+http://www\.pipl\.com/bot/\)
exclude .* DotBot/.*; \+?https?://(www\.|)opensiteexplorer\.org/dotbot[,;] help@moz\.com\)
## Bot(net?) hiding behind an old user agent; so far a one off, hit hard on one day
exclude Mozilla/4\.0 \(compatible; MSIE [67]\.0; Windows NT (6\.0;|5\.1; SV1)\)
## 2020-06
exclude Rustbot/0\.3
## 2020-12
exclude Mozilla/5\.0 \(Macintosh; Intel Mac OS X 10_15_7\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/86\.0\.4240\.111 Safari/537\.36
## 2020-12-22 through 26 only, crawling one site
exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/75\.0\.3729\.169 Safari/537\.36
#$ 10,000 hits/day
exclude serpstatbot/.*
## Generic rule for anything identifying itself as a spider. Avoid adding anything else that matches this.
exclude .*[Ss]pider.*
## The Splash scraper: https://github.com/scrapinghub/splash - fakes UAs but with its own signature
exclude .* splash Version/.* .*
## 2020-07-20
exclude .* \+centuryb\.o\.t9\[at\]gmail\.com.*
## 2020-11 Attempted SQL injection attack via UA, malicious SQL comes after UA or in fragments
exclude Mozilla/5\.0 \(X11; U; Linux x86_64; en-US\) AppleWebKit/533\.4 \(KHTML, like Gecko\) Chrome/5\.0\.375\.99 Safari/533\.4.*
## Part of above, involves a number of 1234=1234 type clauses to force matches
exclude .*(ORDER BY|MAKE_SET|CASE WHEN|AS .... WHERE|OR ....=....|\(....=....|\(SELECT |RDB\$DATABASE).*
## 2020-12
exclude Archive Team
## 2020-12-17 only, crawling one site
#exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/87\.0\.4280\.66 Safari/537\.36
## 2020-03 ruining a brand-new agent as soon as it's in the wild
exclude Mozilla/5\.0 \(Macintosh; Intel Mac OS X 11\.1; rv:84\.0\) Gecko/20100101 Firefox/84\.0
exclude Mozilla/5.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/87\.0\.4280\.66 Safari/537\.36
## 2021-11 similar to above
exclude ArchiveTeam.*
## 2022-01
exclude HonoluluBot
#$ 2,000 hits/day
exclude .* Googlebot/.*; \+http://www\.google\.com/bot.html\)
exclude Googlebot/.* \(\+http://www\.google\.com/bot.html\)
exclude .* [Bb]ingbot/.*; \+http://www\.bing\.com/bingbot\.html?\).*
## These four appear to act in concert. In that regard they appear to be similar
## to the other set of four out of China, but with fewer regional bits.
#Mozilla/5.0(Linux;Android 5.1.1;OPPO A33 Build/LMY47V;wv) AppleWebKit/537.36(KHTML,link Gecko) Version/4.0 Chrome/43.0.2357.121 Mobile Safari/537.36
#Mozilla/5.0(Linux;Android 5.1.1;OPPO A33 Build/LMY47V;wv) AppleWebKit/537.36(KHTML,link Gecko) Version/4.0 Chrome/42.0.2311.138 Mobile Safari/537.36
# Note the misspelling "link Gecko" (instead of "like Gecko") in the two UAs above; the next rule keys on it.
exclude .* AppleWebKit/537\.36\(KHTML,link Gecko\).*
exclude Mozilla/5\.0\(Linux;U;Android 5\.1\.1;zh-CN;OPPO A33 Build/LMY47V\) AppleWebKit/537\.36\(KHTML,like Gecko\) Version/4\.0 Chrome/40\.0\.2214\.89 Mobile Safari/537\.36
exclude Mozilla/5\.0 \(iPad; CPU OS 7_0 like Mac OS X\) AppleWebKit/537\.51\.1 \(KHTML, like Gecko\) Version/7\.0 Mobile/11A465 Safari/9537\.53
## 2020-04-14, incomplete fake Windows 10 with no other UA info
exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\)
## 2020-07 to 08 - fake older OSes with much older Chrome; runs multiple fake agents each
## day, meaning we'll have to block old Chromes of this style universally - likely these
## are mismatched to the WebKit version though
exclude Mozilla/5\.0 \(.*\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/(2[789]|3[01234567]|4[01])\..* Safari/(537\.36|4E423F).*
## 2021-04
exclude GoodBot
exclude .* Neevabot/1\.0; \+https://neeva\.com/neevabot\)
## 2021-05 - fake older OS and browser
exclude Mozilla/5\.0 \(Macintosh; Intel Mac OS X 10_12_6\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/60\.0\.3112\.90 Safari/537\.36
## 2021-08
exclude Arctic Wolf Scan
## This is likely a valid user agent but is being abused by a misbehaving bot, possibly a botnet, based on the IPs it uses
exclude Mozilla/[45]\.0 \(compatible; MSIE 8\.0; Windows NT 5\.[01]; Trident/4\.0; .*
exclude Mozilla/4\.0 \(compatible; MSIE 8\.0; Windows NT 6\.0; Trident/4\.0\)
## This is an older user agent possibly being abused by a bot. 2020-05
exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/63\.0\.3239\.84 Safari/537\.36
## 2020-11
exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/78\.0\.3904\.108 Safari/537\.36
#$ 1,000 hits/day
## Yes, this is the entire UA.
exclude Blackboard Safeassign
exclude .* Daum/.*; \+http://cs\.daum\.net.*
exclude Mozilla/5\.0 \(\(Windows; U; Windows NT 6\.1; fr; rv:1\.9\.2\) Gecko/20100115 Firefox/3\.6\)
## Obvious crawler hiding behind a very recent UA, remove as soon as it stops.
exclude Mozilla/5\.0 \(Windows NT 10\.0; Win64; x64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/74\.0\.3729\.169 Safari/537\.36
## 2020-07
exclude Mozilla/5\.0 \(X11; U; Linux x86_64; en-US; rv:1\.7\.12\) Gecko/20060202 CentOS/1\.0\.7\-1\.4\.3\.centos4 Firefox/1\.0\.7.*
## 2020-11
exclude Mozilla/5\.0 \(X11; Linux x86_64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/80\.0\.3987\.0 Safari/537\.36
## 2020-12 SQL attacking in URL and UA
exclude Mozilla/5\.0 \(Windows; U; Windows NT 6\.0; es-es\) AppleWebKit/528\.16 \(KHTML, like Gecko\) Version/4\.0 Safari/528\.16.*
## 2020-12-14 only, crawling one site
#exclude Mozilla/5\.0 \(iPhone; CPU iPhone OS 10_3_1 like Mac OS X\) AppleWebKit/603\.1\.30 \(KHTML, like Gecko\) Version/10\.0 Mobile/14E304 Safari/602\.1
## 2021-10
exclude Mozilla/5\.0 \(Windows NT 6\.3; Trident/7\.0; rv:11\.0\) like Gecko
## 2022-09
exclude .* SeekportBot; \+https://bot\.seekport\.com\)
#$ 500 hits/day
## Yes, this is the entire UA.
exclude ia_archiver
exclude python-requests/.*
exclude .* Qwantify/.*; \+https://www\.qwant\.com/\).*
## Yes, this is the entire UA.
exclude The Knowledge AI
## Someone working from the command line.
exclude .* HeadlessChrome/.*
## 2020-02-14
exclude Mozilla/5\.0 \(X11; Linux x86_64\) AppleWebKit/537\.36 \(KHTML, like Gecko\) Chrome/74\.0\.3729\.157 Safari/537\.36
## 2020-03-20, unbelievable Windows 7 renaissance
exclude Mozilla/5\.0 \(Windows NT 6\.1\) AppleWebKit/.* \(KHTML, like Gecko\) Chrome/.* Safari/.*
## 2020-04-14
exclude Re-re Studio \(\+http://2re\.site/\)
exclude CcahiBot \(ccahi\.feedback\@gmail\.com\)
## 2020-05
exclude Mozilla/5\.0 \(Windows NT 6\.(1|2); (WOW64; )?Trident/7\.0; rv:11\.0\) like Gecko
## 2020-06
exclude Turnitin \(https://bit\.ly/2UvnfoQ\)
exclude Mozilla/4\.0 \(compatible; MSIE 7\.0; Windows NT 5\.1; \.NET CLR 2\.0\.50727; \.NET CLR 1\.1\.4322\)
exclude AccompanyBot
## 2020-07
exclude BananaBot/.*
## 2020-08
exclude Scrapy/.* \(\+https://scrapy\.org\)
## 2020-12
exclude Mozilla/4\.0 \(compatible; MSIE 7\.0; Windows NT 5\.0\) SiteCheck-sitecrawl by Siteimprove\.com
## 2020-12
exclude .* SiteCheck\-sitecrawl by Siteimprove\.com
## Crawling .edu sites in alphabetical order hidden behind a slightly older Mac Firefox
exclude Mozilla/5\.0 \(Macintosh; Intel Mac OS X 10\.14; rv:68\.0\) Gecko/20100101 Firefox/68\.0
## 2021-02
exclude Re\-re Studio \(\+http://vip0\.ru/\)
exclude .* TUDelftNetworkMeasurement/2020 \(contact; t\.fiebig; at; tudelft\.nl; s\.r\.g\.pletinckx; at; student\.tudelft\.nl\)
## 2021-03
exclude User-Agent: Mozilla/5\.0 .*
## 2021-05
exclude httpx - Open-source project \(github\.com/projectdiscovery/httpx\)
## 2021-06
exclude .* InfoTigerBot/.*; \+https://infotiger\.com/bot\)
exclude curl/.*
## 2021-08
exclude subjs
## 2021-09
exclude Riddler \(http://riddler\.io/about\)
## 2021-12
exclude newspaper/.*
## 2022-07
exclude .* Mail\.RU_Bot/2\.0; \+https://help\.mail\.ru/webmaster/indexing/robots\)
#$ 100 hits/day
exclude Python/.* aiohttp/.*
exclude Linguee Bot \(http://www\.linguee\.com/bot; bot@linguee\.com\)
exclude eContext/1\.0 \(eContext Classification Engine\)
## Super old Ubuntu visits hundreds of times per day? Doubtful.
exclude Mozilla/5\.0 \(X11; U; Linux i686; en-GB; rv:1\.8\.1\.10\) Gecko/20071126 Ubuntu/7\.10 \(gutsy\) Firefox/2\.0\.0\.10
exclude Mozilla/5\.0 \(X11; Linux x86_64\) AppleWebKit/535\.11 \(KHTML, like Gecko\) Ubuntu/11\.10 Chromium/17\.0\.963\.65 Chrome/17\.0\.963\.65 Safari/535\.11\)
## Also seems to have had a problem with proper escaping
exclude Mozilla/5\.0 \(X11; Linux x86_64\) AppleWebKit/535\.11 \(KHTML, like Gecko\) Ubuntu/11\.10 Chromium/17\.0\.963\.65 Chrome/17\.0\.963\.65 Safari/535\.11\\x22\'\`\-\-
## 2020-07
exclude Mozilla/5\.0 \(compatible; heritrix/.* \+http://jiransecurity\.com\)
## 2020-09
exclude ReachabilityCheckBot/0\.2
exclude Mozilla/4\.0 \(compatible; MSIE 6\.0b; Windows NT 5\.1; DigExt\)
## 2020-12
exclude Mozilla/5\.0 \(compatible; Domains Project/1\.2\.9; \+https://domainsproject\.org\)
exclude Mozilla/5\.0 \(compatible; Adsbot/3\.1\)
exclude Mozilla/5\.0 \(compatible; Adsbot/3\.1\; \+https://seostar\.co/robot/\)
exclude HTMLParser/2\.0
## The entirety of one bogus UA.
exclude Mozilla/4\.0
## Fake Facebook bot, missing the first letter
exclude acebookexternalhit/1\.0 \(\+http://www\.facebook\.com/externalhit_uatext\.php\)
exclude Keybot Translation-Search-Machine
exclude 7Siters/.* \(\+https://7ooo\.ru/siters/\)
## 2020-04-14
exclude Gigabot
## 2020-05
exclude Mozilla/4\.0 \(compatible; MSIE 8\.0; Windows NT 6\.1; WOW64; Trident/4\.0; SLCC2; \.NET CLR 2\.0\.50727; InfoPath\.2\)
exclude Mozilla/4\.0 \(compatible; MSIE 7\.0; Windows NT 6\.1; SLCC2; \.NET CLR 2\.0\.50727; \.NET CLR 3\.5\.30729; \.NET CLR 3\.0\.30729; Media Center PC 6\.0; \.NET4\.0C; \.NET4\.0E\)
## 2020-06
exclude Mozilla/4\.0 \(compatible; MSIE 7\.0; Windows NT 6\.0; WOW64; SLCC1; \.NET CLR 3\.0\.04506\)
exclude Mozilla/5\.0 \(Windows NT 6\.1; APCPMS=\^N201610260943063673863E5829051FACE43F_150\^; Trident/7\.0; rv:11\.0\) like Gecko
exclude Mozilla/5\.0 \(Windows NT 6\.1; WOW64; rv:56\.0\) Gecko/20100101 Firefox/56\.0,gzip\(gfe\)
## 2020-08
exclude Mozilla/5\.0 \( compatible \)
## 2021-03
exclude Mozilla/5\.0
exclude Go-http-client/1\.1
exclude LanaiBotmarch
## 2021-04
exclude LanaiBotapr1
## 2021-05
exclude lanaibot .*
## 2021-06
exclude Java-http-client/.*
exclude Java/.*
exclude TinyTestBot
## 2021-07
exclude node-fetch/.* \(\+https://github\.com/bitinn/node-fetch\)
exclude TinyBotTestUA
## 2021-08
exclude .* archive.org_bot; Archive-It; \+http://archive-it\.org/files/site-owners\.html\)
## 2021-09
exclude Mozilla/4\.0 \(Mozilla/4\.0; MSIE 7\.0; Windows NT 5\.1; FDM; SV1\)
exclude Mozilla/5\.0 \(compatible; DataForSeoBot/1\.0; \+https://dataforseo\.com/dataforseo-bot\)
## 2022-01
exclude Wget/.* \(linux-gnu\)
## 2022-03
exclude zgtemple-bot
## 2022-07
exclude .* intelx\.io_bot \+https://intelx\.io\)
exclude mozilla
exclude .* Domains Project/.*; \+https://domainsproject\.org\)
## 2022-10
exclude WallabyupBot/.* \(\+https://wallabyup\.com/bot\.php\) Surfing code: AU
exclude Mozilla/5.0 \(compatible; archive\.org_bot/.* \+http://pandora\.nla\.gov\.au/crawl\.html\)
## 2023-01
exclude Playwright/.* \(x64; ubuntu .*\) python/.*
## 2023-02
exclude WallabyupBot/.* \(\+https://wallabyup\.au/bot\.php\) Surfing code: .*
#$ 10 hits/day
exclude Twitterbot/.*
exclude .* \(Applebot/.*; \+http://www\.apple\.com/go/applebot\)
exclude .* Mail\.RU_Bot/.*; \+http://go\.mail\.ru/help/robots\)
exclude weborama-fetcher \(\+http://www\.weborama\.com\)
exclude Mozilla/5\.0 \(compatible; YandexAccessibilityBot/3\.0; \+http://yandex\.com/bots\)
exclude facebookexternalhit/.* \(\+http://www\.facebook\.com/externalhit_uatext\.php\)
exclude .* linkdexbot/.*; \+http://www\.linkdex\.com/bots/\)
## 2021-05
exclude .* GeedoBot; \+http://www\.geedo\.com/bot\.html\)
exclude .* MojeekBot/.*; \+https://www\.mojeek\.com/bot\.html\)
exclude node-fetch
##### No recent hits
## ...
exclude Zepheira QA/.*
exclude DuckDuckBot/.*; \(\+http://duckduckgo\.com/duckduckbot\.html\)
exclude .* BLEXBot/.*; \+http://webmeup-crawler\.com/\)
exclude .* Exabot/.*; \+http://www\.exabot\.com/go/robot\)
exclude .* Konqueror/.*; Linux\) KHTML/.* \(like Gecko\) \(Exabot-Thumbnails\)
exclude .* MSIE .*; Windows NT .*; Trident/.*\) LinkCheck by Siteimprove\.com
exclude .* Yahoo! Slurp; http://help\.yahoo\.com/help/us/ysearch/slurp\)
exclude .* Yandex(Mobile)?Bot/.*; \+http://yandex\.com/bots\)
exclude Apache-HttpClient/.* \(Java/.*\)
## Yes, this is the entire UA.
exclude panscient\.com
exclude Slackbot-LinkExpanding .* \(\+https://api\.slack\.com/robots\)
exclude SocialRankIOBot; http://socialrank\.io/about
exclude ltx71.*\(http://ltx71\.com/\)