1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
|
// Package detector provides regex-based PII detection for DACH (DE/AT/CH) text.
// Supported types: EMAIL, PHONE, CREDIT_CARD, IBAN, PERSONAL_ID, SOCIAL_SECURITY,
// TAX_ID, VAT_ID, KVNR, LICENSE_PLATE, DRIVER_LICENSE, URL_SECRET, SECRET, ADDRESS.
package detector
import (
"fmt"
"sort"
"strings"
)
// PiiType identifies the kind of PII a Finding represents.
// Its string value is used verbatim as the prefix of generated placeholders
// (for example "[EMAIL_1]") and is what NewScanner compares detector names
// against after upper-casing them.
type PiiType string
// PII categories known to the package. The string values double as the
// detector names accepted by NewScanner (matched after upper-casing) and as
// the prefix of generated placeholders such as "[IBAN_1]".
const (
PiiEmail PiiType = "EMAIL"
PiiPhone PiiType = "PHONE"
PiiCreditCard PiiType = "CREDIT_CARD"
PiiIBAN PiiType = "IBAN"
PiiPersonalID PiiType = "PERSONAL_ID"
PiiSocialSec PiiType = "SOCIAL_SECURITY"
PiiTaxID PiiType = "TAX_ID"
PiiAddress PiiType = "ADDRESS"
// PiiSecret findings are the only ones that carry a Finding.RuleID.
PiiSecret PiiType = "SECRET"
// NOTE(review): presumably secrets embedded in URLs — confirm against
// detectURLSecret.
PiiURLSecret PiiType = "URL_SECRET"
// NOTE(review): KVNR presumably stands for the German health-insurance number
// (Krankenversichertennummer) — confirm against detectKVNR.
PiiKVNR PiiType = "KVNR"
PiiLicensePlate PiiType = "LICENSE_PLATE"
PiiVATID PiiType = "VAT_ID"
PiiDriverLicense PiiType = "DRIVER_LICENSE"
)
// piiPriority controls overlap resolution: when two findings cover overlapping
// spans of the text, the finding whose type has the higher value wins.
// A type that is missing from this map has priority 0 (the map's zero value)
// and therefore loses against every type listed here.
var piiPriority = map[PiiType]int{
PiiSecret: 6,
PiiURLSecret: 6,
PiiIBAN: 5,
PiiCreditCard: 5,
PiiSocialSec: 5,
PiiKVNR: 5,
PiiPersonalID: 4,
PiiTaxID: 4,
PiiEmail: 4,
PiiVATID: 4,
PiiDriverLicense: 4,
PiiPhone: 3,
PiiLicensePlate: 3,
PiiAddress: 2,
}
// Finding is a single PII detection result.
type Finding struct {
// Type is the PII category that matched.
Type PiiType
// Start and End are byte offsets into the scanned text. ScanWithWhitelist
// replaces the bytes from Start up to, but not including, End with Placeholder.
Start int
End int
// Text is the matched text. Whitelist tokens are compared against it, and
// placeholder de-duplication is keyed on it.
Text string
// Confidence is a detector-supplied score; its scale is not visible in this
// file — TODO confirm against the detectors.
Confidence float64
// Placeholder is the replacement token of the form "[TYPE_n]" assigned by
// ScanWithWhitelist; it is empty until then.
Placeholder string
RuleID string // only set for SECRET findings
}
// Scanner runs all enabled detectors and produces anonymised text.
// Construct one with NewScanner; the zero value has no detectors enabled.
type Scanner struct {
// enabledTypes is the set of detector types selected through NewScanner.
// It is left nil when allTypes is set.
enabledTypes map[PiiType]bool
// allTypes, when true, enables every detector regardless of enabledTypes.
allTypes bool
}
// NewScanner creates a Scanner restricted to the named detectors.
//
// Names are matched case-insensitively against the PiiType values (for
// example "email" or "IBAN"). Surrounding whitespace is ignored and blank
// entries are skipped. Names that do not correspond to a known PiiType are
// accepted but never enable anything.
//
// Pass an empty (or nil) slice to enable all detectors.
func NewScanner(detectors []string) *Scanner {
	if len(detectors) == 0 {
		return &Scanner{allTypes: true}
	}
	enabled := make(map[PiiType]bool, len(detectors))
	for _, name := range detectors {
		// Normalise the name so that values split from a configuration string
		// such as "email, phone" still select the intended detector.
		name = strings.ToUpper(strings.TrimSpace(name))
		if name == "" {
			continue
		}
		enabled[PiiType(name)] = true
	}
	return &Scanner{enabledTypes: enabled}
}
// isEnabled reports whether the detector for t should run: always when the
// Scanner was created with every detector enabled, otherwise only when t was
// requested explicitly.
func (s *Scanner) isEnabled(t PiiType) bool {
	// Reading from a nil enabledTypes map is safe and yields false.
	return s.allTypes || s.enabledTypes[t]
}
// Scan runs every enabled detector over text and returns the anonymised text
// and the individual findings. It is equivalent to calling ScanWithWhitelist
// with a nil whitelist.
// Findings are sorted by start position (ascending) in the returned slice.
// When nothing is detected, text is returned unchanged with nil findings.
func (s *Scanner) Scan(text string) (string, []Finding) {
return s.ScanWithWhitelist(text, nil)
}
// ScanWithWhitelist behaves like Scan, but suppresses findings whose matched
// text contains any whitelist token (case-insensitive substring match).
//
// Processing steps:
//  1. run every enabled detector over text;
//  2. drop whitelisted findings;
//  3. resolve overlapping findings by priority;
//  4. assign placeholders of the form "[TYPE_n]", numbering each type from 1
//     in order of appearance; identical matched text always receives the
//     placeholder of its first occurrence;
//  5. substitute the placeholders into the text.
//
// It returns the anonymised text and the surviving findings sorted by Start
// (ascending). If nothing is detected, or every finding is whitelisted, text
// is returned unchanged together with a nil slice.
func (s *Scanner) ScanWithWhitelist(text string, whitelist []string) (string, []Finding) {
	type entry struct {
		t  PiiType
		fn func(string) []Finding
	}
	detectors := []entry{
		{PiiSecret, detectSecrets},
		{PiiURLSecret, detectURLSecret},
		{PiiEmail, detectEmail},
		{PiiPhone, detectPhone},
		{PiiCreditCard, detectCreditCard},
		{PiiIBAN, detectIBAN},
		{PiiPersonalID, detectPersonalID},
		{PiiSocialSec, detectSocialSecurity},
		{PiiTaxID, detectTaxID},
		{PiiVATID, detectVATID},
		{PiiKVNR, detectKVNR},
		{PiiLicensePlate, detectLicensePlate},
		{PiiDriverLicense, detectDriverLicense},
		{PiiAddress, detectAddress},
	}

	var all []Finding
	for _, d := range detectors {
		if s.isEnabled(d.t) {
			all = append(all, d.fn(text)...)
		}
	}
	if len(all) == 0 {
		return text, nil
	}

	all = filterWhitelisted(all, whitelist)
	if len(all) == 0 {
		return text, nil
	}

	resolved := resolveOverlaps(all)
	// Establish ascending order once: it is the documented order of the result,
	// it makes placeholder numbering follow reading order, and the single-pass
	// substitution below depends on it.
	sort.SliceStable(resolved, func(i, j int) bool { return resolved[i].Start < resolved[j].Start })

	// Assign placeholders; identical text → identical placeholder.
	counters := make(map[PiiType]int)
	seenText := make(map[string]string, len(resolved))
	for i := range resolved {
		f := &resolved[i]
		if ph, ok := seenText[f.Text]; ok {
			f.Placeholder = ph
			continue
		}
		counters[f.Type]++
		f.Placeholder = fmt.Sprintf("[%s_%d]", string(f.Type), counters[f.Type])
		seenText[f.Text] = f.Placeholder
	}

	// Build the output in one left-to-right pass. Splicing each placeholder
	// into a byte slice would re-copy the tail of the text for every finding.
	var out strings.Builder
	out.Grow(len(text))
	cursor := 0 // byte offset in text up to which output has been produced
	for _, f := range resolved {
		out.WriteString(text[cursor:f.Start])
		out.WriteString(f.Placeholder)
		cursor = f.End
	}
	out.WriteString(text[cursor:])

	return out.String(), resolved
}
// filterWhitelisted removes every finding whose Text contains at least one
// whitelist token. Tokens are lower-cased and trimmed of surrounding
// whitespace; blank tokens are ignored. The comparison is a case-insensitive
// substring match. When no usable token remains, findings is returned as is;
// otherwise a newly allocated slice is returned.
func filterWhitelisted(findings []Finding, whitelist []string) []Finding {
	needles := make([]string, 0, len(whitelist))
	for _, raw := range whitelist {
		if tok := strings.TrimSpace(strings.ToLower(raw)); tok != "" {
			needles = append(needles, tok)
		}
	}
	if len(needles) == 0 {
		return findings
	}

	kept := make([]Finding, 0, len(findings))
nextFinding:
	for _, f := range findings {
		lowered := strings.ToLower(f.Text)
		for _, tok := range needles {
			if strings.Contains(lowered, tok) {
				// Whitelisted: drop this finding and move on to the next one.
				continue nextFinding
			}
		}
		kept = append(kept, f)
	}
	return kept
}
// resolveOverlaps keeps the highest-priority non-overlapping finding at each position.
//
// Findings are considered from strongest to weakest: higher piiPriority
// first, then longer match, then earlier Start, with remaining ties broken by
// input order. A finding is kept only if it does not overlap any finding that
// has already been kept. Spans are half-open, so a finding that starts exactly
// where another ends does not overlap it.
//
// Selecting in rank order (rather than sweeping left to right and replacing
// the previous pick) guarantees that a finding is discarded only when a
// stronger finding that actually overlaps it is present in the result. A
// left-to-right replace strategy can lose a finding to an overlapping
// competitor that is itself replaced later, leaving the first span unprotected.
//
// The returned slice is sorted by Start (ascending) and its spans are
// pairwise disjoint. The input slice is not modified. A nil slice is returned
// for empty input.
func resolveOverlaps(findings []Finding) []Finding {
	if len(findings) == 0 {
		return nil
	}

	// Rank a copy so the caller's slice keeps its order.
	ranked := make([]Finding, len(findings))
	copy(ranked, findings)
	sort.SliceStable(ranked, func(i, j int) bool {
		a, b := ranked[i], ranked[j]
		if pa, pb := piiPriority[a.Type], piiPriority[b.Type]; pa != pb {
			return pa > pb
		}
		if la, lb := a.End-a.Start, b.End-b.Start; la != lb {
			return la > lb
		}
		return a.Start < b.Start
	})

	// kept is maintained sorted by Start with pairwise disjoint spans, so a
	// candidate can only collide with its immediate neighbours.
	kept := make([]Finding, 0, len(ranked))
	for _, f := range ranked {
		// pos is the index of the first kept finding starting at or after f.Start.
		pos := sort.Search(len(kept), func(i int) bool { return kept[i].Start >= f.Start })
		if pos > 0 && kept[pos-1].End > f.Start {
			continue // left neighbour reaches into f
		}
		if pos < len(kept) && kept[pos].Start < f.End {
			continue // f reaches into right neighbour
		}
		// Insert f at pos, preserving the ordering by Start.
		kept = append(kept, Finding{})
		copy(kept[pos+1:], kept[pos:])
		kept[pos] = f
	}
	return kept
}
|