matchergroup_mph.go

package strmatcher

import (
	"math/bits"
	"sort"
	"strings"
	"unsafe"
)

// PrimeRK is the prime base used in Rabin-Karp algorithm.
const PrimeRK = 16777619

// RollingHash calculates the Rabin-Karp rolling hash of the given string.
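// Because the hash accumulates from the last byte to the first,
// RollingHash("."+s) == RollingHash(s)*PrimeRK + uint32('.'), which lets
// AddDomainMatcher and MatchAny derive a longer suffix's hash from a shorter
// one without rehashing.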
func RollingHash(s string) uint32 {
	h := uint32(0)
	for i := len(s) - 1; i >= 0; i-- {
		h = h*PrimeRK + uint32(s[i])
	}
	return h
}

// MphMatcherGroup is an implementation of MatcherGroup.
// It implements the Rabin-Karp algorithm and a minimal perfect hash table for Full and Domain matchers.
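// The matcher is backed by a two-level table: level0 maps a rule's rolling
// hash to a per-bucket seed, and level1 maps the seeded string hash of the
// rule to its index in rules, so a lookup costs two array reads plus one
// final string comparison.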
type MphMatcherGroup struct {
	rules      []string
	level0     []uint32
	level0Mask int
	level1     []uint32
	level1Mask int
	ruleMap    *map[string]uint32
}
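
// NewMphMatcherGroup creates a new MphMatcherGroup.
//
// Illustrative usage sketch (someDomainMatcher stands in for any DomainMatcher
// value; it is not defined in this file):
//
//	g := NewMphMatcherGroup()
//	g.AddDomainMatcher(someDomainMatcher, 0)
//	g.Build() // freeze the rules into the minimal perfect hash table
//	matched := g.MatchAny("www.example.com")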
func NewMphMatcherGroup() *MphMatcherGroup {
	return &MphMatcherGroup{
		rules:      nil,
		level0:     nil,
		level0Mask: 0,
		level1:     nil,
		level1Mask: 0,
		ruleMap:    &map[string]uint32{},
	}
}

// AddFullMatcher implements MatcherGroupForFull.
func (g *MphMatcherGroup) AddFullMatcher(matcher FullMatcher, _ uint32) {
	pattern := strings.ToLower(matcher.Pattern())
	(*g.ruleMap)[pattern] = RollingHash(pattern)
}

// AddDomainMatcher implements MatcherGroupForDomain.
func (g *MphMatcherGroup) AddDomainMatcher(matcher DomainMatcher, _ uint32) {
	pattern := strings.ToLower(matcher.Pattern())
	h := RollingHash(pattern)
	(*g.ruleMap)[pattern] = h
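	// Also register "."+pattern so any subdomain matches; its hash is derived
	// from h via the rolling-hash property instead of being recomputed.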
	(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
}

// Build builds a minimal perfect hash table for the inserted rules.
func (g *MphMatcherGroup) Build() {
	keyLen := len(*g.ruleMap)
	if keyLen == 0 {
		keyLen = 1
		(*g.ruleMap)["empty___"] = RollingHash("empty___")
	}
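	// Size both levels to powers of two so that indexing can use cheap bit
	// masks: level0 holds one seed per bucket, level1 one rule index per slot.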
	g.level0 = make([]uint32, nextPow2(keyLen/4))
	g.level0Mask = len(g.level0) - 1
	g.level1 = make([]uint32, nextPow2(keyLen))
	g.level1Mask = len(g.level1) - 1
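	// Phase 1: distribute the rules into level0 buckets keyed by their rolling hash.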
	sparseBuckets := make([][]int, len(g.level0))
	var ruleIdx int
	for rule, hash := range *g.ruleMap {
		n := int(hash) & g.level0Mask
		g.rules = append(g.rules, rule)
		sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
		ruleIdx++
	}
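	// The rule map is no longer needed once the rules are bucketed. Non-empty
	// buckets are sorted by descending size so the most crowded ones are seeded
	// first, while level1 still has plenty of free slots.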
	g.ruleMap = nil
	var buckets []indexBucket
	for n, vals := range sparseBuckets {
		if len(vals) > 0 {
			buckets = append(buckets, indexBucket{n, vals})
		}
	}
	sort.Sort(bySize(buckets))
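	// Phase 2: for each bucket, search for a seed under which every rule in the
	// bucket lands in a distinct, still-unoccupied level1 slot; on a collision,
	// roll back the tentatively claimed slots and try the next seed.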
	occ := make([]bool, len(g.level1))
	var tmpOcc []int
	for _, bucket := range buckets {
		seed := uint32(0)
		for {
			findSeed := true
			tmpOcc = tmpOcc[:0]
			for _, i := range bucket.vals {
				n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask // nosemgrep
				if occ[n] {
					for _, n := range tmpOcc {
						occ[n] = false
					}
					seed++
					findSeed = false
					break
				}
				occ[n] = true
				tmpOcc = append(tmpOcc, n)
				g.level1[n] = uint32(i)
			}
			if findSeed {
				g.level0[bucket.n] = seed
				break
			}
		}
	}
}

// Lookup reports whether s, whose rolling hash is h, is present in the hash table.
func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
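	// h selects the level0 bucket and its seed; the seeded string hash selects
	// the level1 slot; the final comparison rejects strings that merely collide.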
	i0 := int(h) & g.level0Mask
	seed := g.level0[i0]
	i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask // nosemgrep
	n := g.level1[i1]
	return s == g.rules[int(n)]
}

// Match implements MatcherGroup.Match.
func (*MphMatcherGroup) Match(_ string) []uint32 {
	return nil
}

// MatchAny implements MatcherGroup.MatchAny.
func (g *MphMatcherGroup) MatchAny(pattern string) bool {
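	// Walk the pattern from right to left, extending the rolling hash of the
	// current suffix one byte at a time. Every '.' boundary is checked against
	// the ".domain" rules; the final Lookup checks the full pattern itself.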
	hash := uint32(0)
	for i := len(pattern) - 1; i >= 0; i-- {
		hash = hash*PrimeRK + uint32(pattern[i])
		if pattern[i] == '.' {
			if g.Lookup(hash, pattern[i:]) {
				return true
			}
		}
	}
	return g.Lookup(hash, pattern)
}
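
// nextPow2 returns the smallest power of two strictly greater than v, or 1 for v <= 1.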
func nextPow2(v int) int {
	if v <= 1 {
		return 1
	}
	const MaxUInt = ^uint(0)
	n := (MaxUInt >> bits.LeadingZeros(uint(v))) + 1
	return int(n)
}

type indexBucket struct {
	n    int
	vals []int
}
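
// bySize sorts index buckets by descending number of rules, so that the
// largest buckets are seeded first in Build.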
type bySize []indexBucket

func (s bySize) Len() int           { return len(s) }
func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
func (s bySize) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
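
// stringStruct mirrors the memory layout of a Go string header so that a
// string's bytes can be read through an unsafe.Pointer.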
type stringStruct struct {
	str unsafe.Pointer
	len int
}
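
// strhashFallback hashes the string pointed to by a, using h as the seed.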
func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
	x := (*stringStruct)(a)
	return memhashFallback(x.str, h, uintptr(x.len))
}

const (
	// Constants for multiplication: four random odd 64-bit numbers.
	m1 = 16877499708836156737
	m2 = 2820277070424839065
	m3 = 9497967016996688599
	m4 = 15839092249703872147
)
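
// The Go runtime randomizes its hash keys at startup; fixed keys keep the hash
// deterministic for a given seed.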
var hashkey = [4]uintptr{1, 1, 1, 1}
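
// memhashFallback hashes s bytes starting at p with the given seed. It is
// adapted from the Go runtime's portable (non-AES) memhash fallback.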
func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
	h := uint64(seed + s*hashkey[0])
tail:
	switch {
	case s == 0:
	case s < 4:
		h ^= uint64(*(*byte)(p))
		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
		h ^= uint64(*(*byte)(add(p, s-1))) << 16
		h = rotl31(h*m1) * m2
	case s <= 8:
		h ^= uint64(readUnaligned32(p))
		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
		h = rotl31(h*m1) * m2
	case s <= 16:
		h ^= readUnaligned64(p)
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-8))
		h = rotl31(h*m1) * m2
	case s <= 32:
		h ^= readUnaligned64(p)
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, 8))
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-16))
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-8))
		h = rotl31(h*m1) * m2
	default:
		v1 := h
		v2 := uint64(seed * hashkey[1])
		v3 := uint64(seed * hashkey[2])
		v4 := uint64(seed * hashkey[3])
		for s >= 32 {
			v1 ^= readUnaligned64(p)
			v1 = rotl31(v1*m1) * m2
			p = add(p, 8)
			v2 ^= readUnaligned64(p)
			v2 = rotl31(v2*m2) * m3
			p = add(p, 8)
			v3 ^= readUnaligned64(p)
			v3 = rotl31(v3*m3) * m4
			p = add(p, 8)
			v4 ^= readUnaligned64(p)
			v4 = rotl31(v4*m4) * m1
			p = add(p, 8)
			s -= 32
		}
		h = v1 ^ v2 ^ v3 ^ v4
		goto tail
	}
	h ^= h >> 29
	h *= m3
	h ^= h >> 32
	return uintptr(h)
}

func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
	return unsafe.Pointer(uintptr(p) + x) // nosemgrep
}
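
// readUnaligned32 and readUnaligned64 assemble bytes in little-endian order on
// every platform; the hash only needs to be internally consistent, so matching
// the native byte order is not required.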
func readUnaligned32(p unsafe.Pointer) uint32 {
	q := (*[4]byte)(p)
	return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24
}

func rotl31(x uint64) uint64 {
	return (x << 31) | (x >> (64 - 31))
}

func readUnaligned64(p unsafe.Pointer) uint64 {
	q := (*[8]byte)(p)
	return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
}