1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
package detector
import (
"fmt"
"regexp"
"sort"
"strings"
)
var (
	// addressRE matches a complete address: an optional preposition, a street
	// name that ends in one of streetSuffixes, a house number, a 4- or 5-digit
	// group (presumably the postal code, "PLZ") and a place name of up to
	// three words. It is nil until init has run.
	addressRE *regexp.Regexp

	// plzPrefilterRE matches a standalone run of four or five digits. Every
	// addressRE match contains such a run delimited by whitespace, so a miss
	// here means addressRE cannot match either.
	plzPrefilterRE = regexp.MustCompile(`\b\d{4,5}\b`)
)

// streetSuffixes lists the lower-case word endings that terminate a street
// name in addressRE (matching is case-insensitive). init sorts this slice in
// place, longest first, before joining it into the pattern.
var streetSuffixes = []string{
	"chaussee", "promenade", "boulevard", "avenue",
	"straße", "strasse", "gässchen", "gäßchen", "gässle", "gäßle", "gässli", "gaessli",
	"str", "weg", "gasse", "allee", "ring", "damm", "pfad",
	"steig", "stieg", "stiege", "steige", "steg", "zeile", "winkel",
	"bogen", "ufer", "kai", "quai", "lände",
	"platz", "markt",
	"berg", "höhe", "halde", "hang", "grund", "graben", "tal", "thal",
	"bach", "aue", "heide", "feld", "ried", "riet", "moos", "holz",
	"forst", "schlag", "leiten", "bühel", "bühl", "büel", "egg", "horst", "weid",
	"acker", "anger", "wiese", "matte", "rain", "trift", "kamp", "breite", "hecke",
	"hof", "garten", "gärtle", "gärtli", "park", "mühle",
	"brücke", "bruck", "brück", "tor", "hafen", "burg", "turm", "warte", "mauer",
	"grün", "runde", "siedlung", "stall", "schanze", "staffel", "stutz",
	"lehen", "rotte",
}

// streetPrepositions lists the prepositional phrases that may precede the
// street name in addressRE. init sorts this slice in place, longest first,
// and lets the words of a phrase be separated by any run of whitespace.
var streetPrepositions = []string{
	"An der", "An dem", "An den", "An die",
	"Auf der", "Auf dem", "Auf den", "Auf die",
	"Bei der", "Bei dem", "Bei den",
	"In der", "In dem", "In den", "In die",
	"Unter der", "Unter dem", "Unter den", "Unter die",
	"Hinter dem", "Hinter der", "Hinter den", "Hinter die",
	"Neben dem", "Neben der",
	"Vor dem", "Vor der", "Vor den", "Vor die",
	"Zu den", "Ob der", "Ob dem",
	"Im", "Am", "Beim", "Zur", "Zum",
}

// init sorts streetSuffixes and streetPrepositions in place by descending
// byte length and compiles addressRE from them. It panics (via MustCompile)
// if the assembled pattern is invalid.
func init() {
	// Longest first, so that inside the alternation a longer entry is tried
	// before any shorter entry that is a prefix of it.
	byLenDesc := func(s []string) {
		sort.Slice(s, func(i, j int) bool { return len(s[i]) > len(s[j]) })
	}
	byLenDesc(streetSuffixes)
	byLenDesc(streetPrepositions)

	sParts := make([]string, len(streetSuffixes))
	for i, s := range streetSuffixes {
		sParts[i] = regexp.QuoteMeta(s)
	}

	pParts := make([]string, len(streetPrepositions))
	for i, p := range streetPrepositions {
		// regexp.QuoteMeta does not escape spaces, so the plain space (not
		// `\ `) is what has to be widened to `\s+`; otherwise "An  der" or
		// "An\nder" would not be recognised as a preposition.
		pParts[i] = strings.ReplaceAll(regexp.QuoteMeta(p), " ", `\s+`)
	}

	// (?i) applies to the whole pattern, so the upper-case character classes
	// below also accept lower-case letters and vice versa.
	pattern := fmt.Sprintf(
		`(?i)`+
			`(?:(?:%s)\s+)?`+ // optional preposition followed by whitespace
			`(?:[A-ZÄÖÜ][a-zäöüß]+(?:[-][A-ZÄÖÜ]?[a-zäöüß]+)*)`+ // street name stem, hyphenated parts allowed
			`[-\s]*`+ // stem and suffix may be fused, hyphenated or spaced
			`(?:%s)\.?`+ // street suffix, optionally abbreviated with a dot
			`\s+`+
			`(?:\d+\s*[a-zA-Z]?(?:\s*/\s*\d+)?)`+ // house number, optional letter and "/n" part
			`,?\s+`+
			`(?:\d{5}|\d{4})`+ // 5- or 4-digit group
			`\s+`+
			`(?:[A-ZÄÖÜ][a-zäöüß]+(?:(?:\s+|-)[A-ZÄÖÜ]?[a-zäöüß]+){0,2})`, // place name, one to three words
		strings.Join(pParts, "|"),
		strings.Join(sParts, "|"),
	)
	addressRE = regexp.MustCompile(pattern)
}
// detectAddress scans text with addressRE and returns one Finding per
// non-overlapping match, in order of appearance. Start and End are the byte
// offsets of the match within text, Text is the matched substring, Type is
// PiiAddress and Confidence is fixed at 0.9. The result is nil when text
// contains no standalone 4- or 5-digit group or when addressRE finds nothing.
func detectAddress(text string) []Finding {
	// Without a 4- or 5-digit group addressRE cannot match, so bail out early.
	if !plzPrefilterRE.MatchString(text) {
		return nil
	}

	spans := addressRE.FindAllStringIndex(text, -1)
	if len(spans) == 0 {
		return nil
	}

	findings := make([]Finding, 0, len(spans))
	for _, span := range spans {
		begin, end := span[0], span[1]
		findings = append(findings, Finding{
			Type:       PiiAddress,
			Start:      begin,
			End:        end,
			Text:       text[begin:end],
			Confidence: 0.9,
		})
	}
	return findings
}
|