-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathdetect-pii
More file actions
executable file
·93 lines (79 loc) · 6.92 KB
/
detect-pii
File metadata and controls
executable file
·93 lines (79 loc) · 6.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
"""
USAGE
detect-pii <threshold> < requests.jsonl
OPTIONS
threshold - Minimum entity recognition score; results scoring below this value are not returned
SETUP
pip install presidio-analyzer
pip install presidio-anonymizer
python -m spacy download en_core_web_lg
EXAMPLES
$ echo '{"id": "1", "kind": "Text", "resource": "My name is Bill!"}' | ./detect-pii 0.5
Result:
{"request_id": "1", "score": 0.85, "entity_type": "PERSON", "entity_value": "Bill"}
NOTES
This may not work on Python 3.14 until an upstream spaCy/Pydantic issue is fixed
"""
import sys
import json
from presidio_analyzer import AnalyzerEngine
# Entity types requested from the analyzer: main() passes this list as the
# `entities` argument of analyzer.analyze(). Each trailing comment gives a short
# description of the entity and, where known, how it is detected.
entities = [
"CREDIT_CARD", # A credit card number, between 12 and 19 digits long (https://en.wikipedia.org/wiki/Payment_card_number).
"CRYPTO", # A crypto wallet number; currently only Bitcoin addresses are supported. Detection: pattern match, context and checksum.
"EMAIL_ADDRESS", # An email address identifying a mailbox to which email messages are delivered. Detection: pattern match, context and RFC-822 validation.
"IBAN_CODE", # The International Bank Account Number (IBAN), an internationally agreed system of identifying bank accounts across national borders.
"IP_ADDRESS", # An Internet Protocol (IP) address, either IPv4 or IPv6. Detection: pattern match, context and checksum.
"NRP", # A person's nationality, religious or political group. Detection: custom logic and context.
"LOCATION", # Name of a politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains). Detection: custom logic and context.
"PERSON", # A full person name, which can include first names, middle names or initials, and last names. Detection: custom logic and context.
"PHONE_NUMBER", # A telephone number. Detection: custom logic, pattern match and context.
"MEDICAL_LICENSE", # Common medical license numbers. Detection: pattern match, context and checksum.
"US_BANK_NUMBER", # A US bank account number, between 8 and 17 digits long. Detection: pattern match and context.
"US_DRIVER_LICENSE", # A US driver license number (formats per https://ntsi.com/drivers-license-format/). Detection: pattern match and context.
"US_ITIN", # US Individual Taxpayer Identification Number (ITIN): nine digits that start with a "9" and contain a "7" or "8" as the 4th digit. Detection: pattern match and context.
"US_PASSPORT", # A US passport number with 9 digits. Detection: pattern match and context.
"US_SSN", # A US Social Security Number (SSN) with 9 digits. Detection: pattern match and context.
"UK_NHS", # A UK NHS number, 10 digits long. Detection: pattern match, context and checksum.
"UK_NINO", # The UK National Insurance Number, a unique identifier used in the administration of National Insurance and tax. Detection: pattern match and context.
"ES_NIF", # A Spanish NIF number (personal tax ID). Detection: pattern match, context and checksum.
"ES_NIE", # A Spanish NIE number (foreigners ID card). Detection: pattern match, context and checksum.
"IT_FISCAL_CODE", # An Italian personal identification code (https://en.wikipedia.org/wiki/Italian_fiscal_code). Detection: pattern match, context and checksum.
"IT_DRIVER_LICENSE", # An Italian driver license number. Detection: pattern match and context.
"IT_VAT_CODE", # An Italian VAT code number. Detection: pattern match, context and checksum.
"IT_PASSPORT", # An Italian passport number. Detection: pattern match and context.
"IT_IDENTITY_CARD", # An Italian identity card number (https://en.wikipedia.org/wiki/Italian_electronic_identity_card). Detection: pattern match and context.
"PL_PESEL", # A Polish PESEL number. Detection: pattern match, context and checksum.
"SG_NRIC_FIN", # A Singapore National Registration Identification Card number. Detection: pattern match and context.
"SG_UEN", # A Unique Entity Number (UEN), the standard identification number for entities registered in Singapore. Detection: pattern match, context and checksum.
"AU_ABN", # The Australian Business Number (ABN), a unique 11-digit identifier issued to all entities registered in the Australian Business Register (ABR).
"AU_ACN", # An Australian Company Number, a unique nine-digit number issued by the Australian Securities and Investments Commission.
"AU_TFN", # The tax file number (TFN), a unique identifier issued by the Australian Taxation Office to each taxpaying entity. Detection: pattern match, context and checksum.
"AU_MEDICARE", # The Medicare number, a unique identifier issued by the Australian Government that enables the cardholder to receive rebates of medical expenses under Australia's Medicare system.
"IN_PAN", # The Indian Permanent Account Number (PAN), a unique alphanumeric identifier issued to all business and individual entities registered as taxpayers. NOTE(review): this comment previously said 12 characters; PAN is usually described as 10 characters - confirm.
"IN_AADHAAR", # Indian government-issued unique 12-digit individual identity number. Detection: pattern match, context and checksum.
"IN_VEHICLE_REGISTRATION", # Indian government-issued transport (govt, personal, diplomatic, defence) vehicle registration number. Detection: pattern match, context and checksum.
"IN_VOTER", # Indian Election Commission-issued 10-digit alphanumeric voter ID for all Indian citizens (age 18 or above). Detection: pattern match and context.
"IN_PASSPORT", # An Indian passport number. Detection: pattern match and context.
"IN_GSTIN", # The Indian Goods and Services Tax Identification Number (GSTIN), a 15-character identifier with state code (01-37), PAN, registration number, 'Z', and checksum.
"FI_PERSONAL_IDENTITY_CODE", # The Finnish Personal Identity Code (Henkilötunnus), a unique 11-character individual identity number. Detection: pattern match, context and custom logic.
"KR_RRN", # The Korean Resident Registration Number (RRN), a 13-digit number issued to all Korean residents. Detection: pattern match, context and custom logic.
"TH_TNIN", # The Thai National ID Number (TNIN), a unique 13-digit number issued to all Thai residents. Detection: pattern match, context and custom logic.
]
def main():
    """Scan JSONL requests from stdin for PII and print one JSON line per hit.

    The score threshold is read from ``sys.argv[1]``. Every non-blank line
    of stdin is parsed as a JSON object and must provide the keys ``id``
    and ``resource``; the ``resource`` text is analyzed for the entity
    types listed in the module-level ``entities``. For each finding whose
    score is not below the threshold (an equal score is kept), a JSON
    object with ``request_id``, ``score``, ``entity_type`` and
    ``entity_value`` is printed to stdout.

    Raises:
        SystemExit: if the threshold argument is missing or is not a number.
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
        KeyError: if a request lacks the ``id`` or ``resource`` key.
    """
    # Validate the command line before constructing the analyzer, so a usage
    # mistake produces a short message instead of a traceback.
    try:
        threshold = float(sys.argv[1])
    except (IndexError, ValueError):
        sys.exit("usage: detect-pii <threshold> < requests.jsonl")

    analyzer = AnalyzerEngine()

    for line in sys.stdin:
        # Blank lines (for example a trailing newline at the end of the
        # input) carry no request and would make json.loads fail.
        if not line.strip():
            continue

        request = json.loads(line)
        text = request["resource"]

        findings = analyzer.analyze(text=text, entities=entities, language='en')
        for analysis in findings:
            if analysis.score < threshold:
                continue
            result_data = {
                "request_id": request["id"],
                "score": analysis.score,
                "entity_type": analysis.entity_type,
                # Slice the matched span back out of the analyzed text.
                "entity_value": text[analysis.start:analysis.end],
            }
            print(json.dumps(result_data))


if __name__ == "__main__":
    main()