Öffentliche Dateiansicht: Raw-Dateien, Tree, Releases und Issues sind ohne Login verfügbar.
detector/scanner.go Raw
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// Package detector scans text for leaked secrets and credentials.
package detector

import (
	"fmt"
	"sort"
	"strings"
)

// PiiType identifies the kind of finding.
//
// Its string value is embedded verbatim in generated placeholders (for
// example "[SECRET_1]"), and detector names passed to NewScanner are
// upper-cased and converted to PiiType to decide which detectors run.
type PiiType string

// Finding types known to the scanner.
const (
	// PiiSecret is the type under which the detectSecrets detector is
	// registered in ScanWithWhitelist.
	PiiSecret PiiType = "SECRET"
)

// piiPriority controls overlap resolution: higher value wins.
// A type that has no entry here resolves to priority 0, the map's zero value.
var piiPriority = map[PiiType]int{
	PiiSecret: 6,
}

// Finding is a single detection result.
type Finding struct {
	// Type is the kind of finding; it also selects the placeholder prefix and
	// the priority used during overlap resolution.
	Type        PiiType
	// Start is the byte offset in the scanned text where the match begins
	// (used as the inclusive lower bound when the text is rewritten).
	Start       int
	// End is the byte offset where the match ends (used as the exclusive
	// upper bound when the text is rewritten).
	End         int
	// Text is the matched text; it is compared against whitelist tokens and
	// used to give identical matches the same placeholder.
	Text        string
	// Confidence is set by the detector that produced the finding; nothing in
	// this file reads it. NOTE(review): presumably a score in [0, 1] — confirm
	// against the detectors.
	Confidence  float64
	// Placeholder is the replacement token (e.g. "[SECRET_1]") assigned by
	// ScanWithWhitelist.
	Placeholder string
	RuleID      string // only set for SECRET findings
}

// Scanner runs enabled detectors over text.
//
// Construct it with NewScanner. Neither field is modified by the methods in
// this file after construction.
type Scanner struct {
	// enabledTypes is the set of explicitly enabled finding types; it is left
	// nil when allTypes is true.
	enabledTypes map[PiiType]bool
	// allTypes, when true, enables every detector regardless of enabledTypes.
	allTypes     bool
}

// NewScanner builds a Scanner restricted to the named detectors.
//
// Names are matched case-insensitively: each one is upper-cased and stored as
// a PiiType. A nil or empty slice enables every detector.
func NewScanner(detectors []string) *Scanner {
	sc := &Scanner{}
	if len(detectors) == 0 {
		sc.allTypes = true
		return sc
	}

	sc.enabledTypes = make(map[PiiType]bool, len(detectors))
	for i := range detectors {
		name := strings.ToUpper(detectors[i])
		sc.enabledTypes[PiiType(name)] = true
	}
	return sc
}

// isEnabled reports whether the detector for type t should run: always when
// the scanner enables all types, otherwise only when t was explicitly listed.
func (s *Scanner) isEnabled(t PiiType) bool {
	// Indexing a nil enabledTypes map is safe and yields false.
	return s.allTypes || s.enabledTypes[t]
}

// Scan anonymises text and reports what was replaced.
//
// It returns the rewritten text together with the individual findings, sorted
// by ascending start position. It is ScanWithWhitelist with no whitelist.
func (s *Scanner) Scan(text string) (string, []Finding) {
	var noWhitelist []string // nil: nothing is suppressed
	anonymised, findings := s.ScanWithWhitelist(text, noWhitelist)
	return anonymised, findings
}

// ScanWithWhitelist behaves like Scan, but suppresses findings whose matched
// text contains any whitelist token (case-insensitive).
//
// Every enabled detector runs over text; whitelisted findings are dropped and
// overlapping findings are reduced by resolveOverlaps. Each surviving finding
// receives a placeholder of the form "[TYPE_n]", numbered per type in order of
// first appearance, and findings with identical Text share one placeholder.
//
// It returns text with every surviving finding replaced by its placeholder,
// and those findings sorted by ascending Start. When nothing is found, or
// everything found is whitelisted, it returns the unmodified text and nil.
func (s *Scanner) ScanWithWhitelist(text string, whitelist []string) (string, []Finding) {
	type entry struct {
		t  PiiType
		fn func(string) []Finding
	}
	detectors := []entry{
		{PiiSecret, detectSecrets},
	}

	var all []Finding
	for _, d := range detectors {
		if s.isEnabled(d.t) {
			all = append(all, d.fn(text)...)
		}
	}
	if len(all) == 0 {
		return text, nil
	}

	all = filterWhitelisted(all, whitelist)
	if len(all) == 0 {
		return text, nil
	}

	resolved := resolveOverlaps(all)

	// Order by position once. Placeholder numbering, the rewrite below and the
	// returned slice all rely on ascending Start.
	sort.SliceStable(resolved, func(i, j int) bool { return resolved[i].Start < resolved[j].Start })

	// The resolved findings do not overlap, so the output can be assembled in
	// a single left-to-right pass: copy the untouched gap before each finding,
	// then its placeholder. This avoids re-copying the tail of the text for
	// every replacement.
	counters := map[PiiType]int{}
	seenText := make(map[string]string, len(resolved))
	var out strings.Builder
	out.Grow(len(text))
	cursor := 0 // byte offset of the first input byte not yet written
	for i := range resolved {
		f := &resolved[i]

		// Identical text → identical placeholder.
		ph, ok := seenText[f.Text]
		if !ok {
			counters[f.Type]++
			ph = fmt.Sprintf("[%s_%d]", string(f.Type), counters[f.Type])
			seenText[f.Text] = ph
		}
		f.Placeholder = ph

		out.WriteString(text[cursor:f.Start])
		out.WriteString(ph)
		cursor = f.End
	}
	out.WriteString(text[cursor:])

	return out.String(), resolved
}

// filterWhitelisted drops every finding whose Text contains at least one
// whitelist token, compared case-insensitively.
//
// Tokens are lower-cased and trimmed of surrounding whitespace; blank tokens
// are ignored. When no usable token remains, the input slice is returned
// unchanged; otherwise a new slice holding the surviving findings is returned.
func filterWhitelisted(findings []Finding, whitelist []string) []Finding {
	needles := make([]string, 0, len(whitelist))
	for _, raw := range whitelist {
		if needle := strings.TrimSpace(strings.ToLower(raw)); needle != "" {
			needles = append(needles, needle)
		}
	}
	if len(needles) == 0 {
		return findings
	}

	kept := make([]Finding, 0, len(findings))
nextFinding:
	for i := range findings {
		haystack := strings.ToLower(findings[i].Text)
		for _, needle := range needles {
			if strings.Contains(haystack, needle) {
				// Whitelisted: skip this finding entirely.
				continue nextFinding
			}
		}
		kept = append(kept, findings[i])
	}
	return kept
}

// resolveOverlaps reduces findings to a set of non-overlapping spans, keeping
// the preferred finding wherever two of them collide.
//
// The input slice is sorted in place by Start, then by descending priority,
// then by descending length. Findings are then taken greedily: one that starts
// at or after the end of the last kept finding is appended; one that overlaps
// it replaces it only if it has a higher priority, or the same priority and a
// longer span. The result is ordered by ascending Start.
func resolveOverlaps(findings []Finding) []Finding {
	span := func(f Finding) int { return f.End - f.Start }

	sort.SliceStable(findings, func(a, b int) bool {
		left, right := findings[a], findings[b]
		leftPrio, rightPrio := piiPriority[left.Type], piiPriority[right.Type]
		switch {
		case left.Start != right.Start:
			return left.Start < right.Start
		case leftPrio != rightPrio:
			return leftPrio > rightPrio
		default:
			return span(left) > span(right)
		}
	})

	var kept []Finding
	frontier := -1 // End of the most recently kept finding
	for _, cand := range findings {
		if cand.Start >= frontier {
			kept = append(kept, cand)
			frontier = cand.End
			continue
		}
		if len(kept) == 0 {
			continue
		}

		// cand overlaps the last kept finding: decide which of the two stays.
		last := &kept[len(kept)-1]
		lastPrio, candPrio := piiPriority[last.Type], piiPriority[cand.Type]
		outranks := candPrio > lastPrio
		longerOnTie := candPrio == lastPrio && span(cand) > span(*last)
		if outranks || longerOnTie {
			*last = cand
			frontier = cand.End
		}
	}
	return kept
}