@@ -0,0 +1,297 @@
+package strmatcher
+
+import (
+	"math/bits"
+	"regexp"
+	"sort"
+	"strings"
+	"unsafe"
+)
+
+// PrimeRK is the prime base used in the Rabin-Karp algorithm (the 32-bit FNV prime).
+const PrimeRK = 16777619
+
+// RollingHash calculates the Rabin-Karp rolling hash of the given string.
+// Bytes are folded in from last to first, so the hash of a string can be
+// extended to the hash of "c"+string with one multiply-add step.
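+//
+// For example:
+//
+//	h := RollingHash("example.com")
+//	hDot := h*PrimeRK + uint32('.') // == RollingHash(".example.com")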
+func RollingHash(s string) uint32 {
+	h := uint32(0)
+	for i := len(s) - 1; i >= 0; i-- {
+		h = h*PrimeRK + uint32(s[i])
+	}
+	return h
+}
+
+// An MphMatcherGroup is divided into three parts:
+// 1. `full` and `domain` patterns are matched by the Rabin-Karp algorithm and a minimal perfect hash table;
+// 2. `substr` patterns are matched by an Aho-Corasick automaton;
+// 3. `regex` patterns are matched with the standard regexp package.
+type MphMatcherGroup struct {
+	ac            *ACAutomaton
+	otherMatchers []matcherEntry
+	rules         []string
+	level0        []uint32 // per-bucket seeds for the perfect hash
+	level0Mask    int
+	level1        []uint32 // rule indices, one per slot
+	level1Mask    int
+	count         uint32
+	ruleMap       *map[string]uint32 // rule -> rolling hash; released after Build
+}
+
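+// AddFullOrDomainPattern adds a `full` or `domain` pattern to the rule map,
+// keyed by its rolling hash. A `domain` pattern is inserted twice so that
+// both the exact domain and any subdomain match: for instance, adding
+// "example.com" as Domain stores entries for "example.com" and ".example.com".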
+func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
+	h := RollingHash(pattern)
+	switch t {
+	case Domain:
+		// One rolling step extends h to the hash of "."+pattern.
+		(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
+		fallthrough
+	case Full:
+		(*g.ruleMap)[pattern] = h
+	default:
+	}
+}
+
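+// NewMphMatcherGroup creates a new MphMatcherGroup. Typical usage (a minimal
+// sketch; error handling elided and the rules purely illustrative):
+//
+//	g := NewMphMatcherGroup()
+//	g.AddPattern("example.com", Domain)
+//	g.Build()
+//	g.Match("www.example.com") // -> [1]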
+func NewMphMatcherGroup() *MphMatcherGroup {
+	return &MphMatcherGroup{
+		ac:            nil,
+		otherMatchers: nil,
+		rules:         nil,
+		level0:        nil,
+		level0Mask:    0,
+		level1:        nil,
+		level1Mask:    0,
+		count:         1,
+		ruleMap:       &map[string]uint32{},
+	}
+}
+
+// AddPattern adds a pattern of the given type to the MphMatcherGroup and
+// returns its matcher id; for `regex` patterns a compilation error may be
+// returned.
+func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
+	switch t {
+	case Substr:
+		if g.ac == nil {
+			g.ac = NewACAutomaton()
+		}
+		g.ac.Add(pattern, t)
+	case Full, Domain:
+		pattern = strings.ToLower(pattern)
+		g.AddFullOrDomainPattern(pattern, t)
+	case Regex:
+		r, err := regexp.Compile(pattern)
+		if err != nil {
+			return 0, err
+		}
+		g.otherMatchers = append(g.otherMatchers, matcherEntry{
+			m:  &regexMatcher{pattern: r},
+			id: g.count,
+		})
+	default:
+		panic("Unknown type")
+	}
+	return g.count, nil
+}
+
+// Build builds the minimal perfect hash table and the Aho-Corasick automaton
+// from the inserted rules.
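+//
+// The table uses the two-level "hash, displace" construction: rules are first
+// scattered into level-0 buckets by their rolling hash; then, largest bucket
+// first, a per-bucket seed is searched by brute force until every rule in the
+// bucket lands in a free level-1 slot. A lookup therefore costs two hash
+// evaluations and one string comparison.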
+func (g *MphMatcherGroup) Build() {
+	if g.ac != nil {
+		g.ac.Build()
+	}
+	keyLen := len(*g.ruleMap)
+	g.level0 = make([]uint32, nextPow2(keyLen/4))
+	g.level0Mask = len(g.level0) - 1
+	g.level1 = make([]uint32, nextPow2(keyLen))
+	g.level1Mask = len(g.level1) - 1
+	// Scatter the rules into level-0 buckets by their rolling hash.
+	var sparseBuckets = make([][]int, len(g.level0))
+	var ruleIdx int
+	for rule, hash := range *g.ruleMap {
+		n := int(hash) & g.level0Mask
+		g.rules = append(g.rules, rule)
+		sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
+		ruleIdx++
+	}
+	g.ruleMap = nil
+	var buckets []indexBucket
+	for n, vals := range sparseBuckets {
+		if len(vals) > 0 {
+			buckets = append(buckets, indexBucket{n, vals})
+		}
+	}
+	// Seed the largest buckets first, while level1 is still mostly empty.
+	sort.Sort(bySize(buckets))
+
+	occ := make([]bool, len(g.level1))
+	var tmpOcc []int
+	for _, bucket := range buckets {
+		var seed = uint32(0)
+		for {
+			findSeed := true
+			tmpOcc = tmpOcc[:0]
+			for _, i := range bucket.vals {
+				n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
+				if occ[n] {
+					// Collision: roll back this bucket and retry with the next seed.
+					for _, n := range tmpOcc {
+						occ[n] = false
+					}
+					seed++
+					findSeed = false
+					break
+				}
+				occ[n] = true
+				tmpOcc = append(tmpOcc, n)
+				g.level1[n] = uint32(i)
+			}
+			if findSeed {
+				g.level0[bucket.n] = seed
+				break
+			}
+		}
+	}
+}
+
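+// nextPow2 returns 1 for v <= 1 and otherwise the smallest power of two
+// strictly greater than v, e.g. nextPow2(3) == 4 and nextPow2(4) == 8. Table
+// lengths therefore stay powers of two, so len-1 can serve as a bit mask.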
+func nextPow2(v int) int {
+	if v <= 1 {
+		return 1
+	}
+	const MaxUInt = ^uint(0)
+	n := (MaxUInt >> bits.LeadingZeros(uint(v))) + 1
+	return int(n)
+}
+
+// Lookup reports whether s, whose rolling hash is h, is in the rule set.
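+//
+// Each inserted rule occupies a unique level-1 slot by construction, so a
+// single string comparison confirms or rejects the candidate; strings that
+// were never inserted simply fail that comparison.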
+func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
+	i0 := int(h) & g.level0Mask
+	seed := g.level0[i0]
+	i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
+	n := g.level1[i1]
+	return s == g.rules[int(n)]
+}
+
+// Match implements IndexMatcher.Match. It returns at most one matcher id and
+// nil when nothing matches.
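+//
+// For example, matching "www.example.com" against the rule
+// `domain:example.com` (an illustrative rule) tests the suffixes ".com" and
+// ".example.com" in one right-to-left pass; the latter hits the "."+pattern
+// entry added by AddFullOrDomainPattern.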
+func (g *MphMatcherGroup) Match(pattern string) []uint32 {
+	result := []uint32{}
+	hash := uint32(0)
+	// Extend the rolling hash one byte at a time; at every '.' the hash of
+	// the current suffix is ready for a `domain` lookup.
+	for i := len(pattern) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(pattern[i])
+		if pattern[i] == '.' {
+			if g.Lookup(hash, pattern[i:]) {
+				result = append(result, 1)
+				return result
+			}
+		}
+	}
+	// The hash now covers the whole input: try a `full` match.
+	if g.Lookup(hash, pattern) {
+		result = append(result, 1)
+		return result
+	}
+	if g.ac != nil && g.ac.Match(pattern) {
+		result = append(result, 1)
+		return result
+	}
+	for _, e := range g.otherMatchers {
+		if e.m.Match(pattern) {
+			result = append(result, e.id)
+			return result
+		}
+	}
+	return nil
+}
+
+// indexBucket is a level-0 bucket: n is the bucket index and vals holds the
+// indices of the rules that hashed into it.
+type indexBucket struct {
+	n    int
+	vals []int
+}
+
+// bySize sorts buckets by size in descending order.
+type bySize []indexBucket
+
+func (s bySize) Len() int           { return len(s) }
+func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
+func (s bySize) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+
+// stringStruct mirrors the runtime's in-memory representation of a string.
+type stringStruct struct {
+	str unsafe.Pointer
+	len int
+}
+
+// strhashFallback hashes the string pointed to by a with the given seed.
+func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
+	x := (*stringStruct)(a)
+	return memhashFallback(x.str, h, uintptr(x.len))
+}
+
+const (
+	// Constants for multiplication: four random odd 64-bit numbers.
+	m1 = 16877499708836156737
+	m2 = 2820277070424839065
+	m3 = 9497967016996688599
+	m4 = 15839092249703872147
+)
+
+// Unlike the runtime, which randomizes its hash keys at startup, the keys
+// here are fixed, making the hash deterministic across runs.
+var hashkey = [4]uintptr{1, 1, 1, 1}
+
+// memhashFallback hashes s bytes at p with the given seed. It is modeled on
+// the Go runtime's fallback (non-AES) memhash.
+func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
+	h := uint64(seed + s*hashkey[0])
+tail:
+	switch {
+	case s == 0:
+	case s < 4:
+		h ^= uint64(*(*byte)(p))
+		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
+		h ^= uint64(*(*byte)(add(p, s-1))) << 16
+		h = rotl31(h*m1) * m2
+	case s <= 8:
+		h ^= uint64(readUnaligned32(p))
+		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
+		h = rotl31(h*m1) * m2
+	case s <= 16:
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	case s <= 32:
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, 8))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-16))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	default:
+		// Process 32 bytes per iteration in four interleaved lanes, then
+		// hash the remainder via the cases above.
+		v1 := h
+		v2 := uint64(seed * hashkey[1])
+		v3 := uint64(seed * hashkey[2])
+		v4 := uint64(seed * hashkey[3])
+		for s >= 32 {
+			v1 ^= readUnaligned64(p)
+			v1 = rotl31(v1*m1) * m2
+			p = add(p, 8)
+			v2 ^= readUnaligned64(p)
+			v2 = rotl31(v2*m2) * m3
+			p = add(p, 8)
+			v3 ^= readUnaligned64(p)
+			v3 = rotl31(v3*m3) * m4
+			p = add(p, 8)
+			v4 ^= readUnaligned64(p)
+			v4 = rotl31(v4*m4) * m1
+			p = add(p, 8)
+			s -= 32
+		}
+		h = v1 ^ v2 ^ v3 ^ v4
+		goto tail
+	}
+
+	h ^= h >> 29
+	h *= m3
+	h ^= h >> 32
+	return uintptr(h)
+}
+
+// add advances p by x bytes.
+func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(p) + x)
+}
+
+// readUnaligned32 reads 4 bytes at p, in little-endian order, without any
+// alignment requirement.
+func readUnaligned32(p unsafe.Pointer) uint32 {
+	q := (*[4]byte)(p)
+	return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24
+}
+
+// rotl31 rotates x left by 31 bits.
+func rotl31(x uint64) uint64 {
+	return (x << 31) | (x >> (64 - 31))
+}
+
+// readUnaligned64 reads 8 bytes at p, in little-endian order, without any
+// alignment requirement.
+func readUnaligned64(p unsafe.Pointer) uint64 {
+	q := (*[8]byte)(p)
+	return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 |
+		uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
+}