matchergroup_ac_automation.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. package strmatcher
  2. import (
  3. "container/list"
  4. )
  5. const (
  6. acValidCharCount = 38 // aA-zZ (26), 0-9 (10), - (1), . (1)
  7. acMatchTypeCount = 3 // Full, Domain and Substr
  8. )
  9. type acEdge byte
  10. const (
  11. acTrieEdge acEdge = 1
  12. acFailEdge acEdge = 0
  13. )
  14. type acNode struct {
  15. next [acValidCharCount]uint32 // EdgeIdx -> Next NodeIdx (Next trie node or fail node)
  16. edge [acValidCharCount]acEdge // EdgeIdx -> Trie Edge / Fail Edge
  17. fail uint32 // NodeIdx of *next matched* Substr Pattern on its fail path
  18. match uint32 // MatchIdx of matchers registered on this node, 0 indicates no match
  19. } // Sizeof acNode: (4+1)*acValidCharCount + <padding> + 4 + 4
  20. type acValue [acMatchTypeCount][]uint32 // MatcherType -> Registered Matcher Values
  21. // ACAutoMationMatcherGroup is an implementation of MatcherGroup.
  22. // It uses an AC Automata to provide support for Full, Domain and Substr matcher. Trie node is char based.
  23. //
  24. // NOTICE: ACAutomatonMatcherGroup currently uses a restricted charset (LDH Subset),
  25. // upstream should manually in a way to ensure all patterns and inputs passed to it to be in this charset.
  26. type ACAutomatonMatcherGroup struct {
  27. nodes []acNode // NodeIdx -> acNode
  28. values []acValue // MatchIdx -> acValue
  29. }
  30. func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup {
  31. ac := new(ACAutomatonMatcherGroup)
  32. ac.addNode() // Create root node (NodeIdx 0)
  33. ac.addMatchEntry() // Create sentinel match entry (MatchIdx 0)
  34. return ac
  35. }
  36. // AddFullMatcher implements MatcherGroupForFull.AddFullMatcher.
  37. func (ac *ACAutomatonMatcherGroup) AddFullMatcher(matcher FullMatcher, value uint32) {
  38. ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
  39. }
  40. // AddDomainMatcher implements MatcherGroupForDomain.AddDomainMatcher.
  41. func (ac *ACAutomatonMatcherGroup) AddDomainMatcher(matcher DomainMatcher, value uint32) {
  42. node := ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) // For full domain match
  43. ac.addPattern(node, ".", matcher.Type(), value) // For partial domain match
  44. }
  45. // AddSubstrMatcher implements MatcherGroupForSubstr.AddSubstrMatcher.
  46. func (ac *ACAutomatonMatcherGroup) AddSubstrMatcher(matcher SubstrMatcher, value uint32) {
  47. ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
  48. }
  49. func (ac *ACAutomatonMatcherGroup) addPattern(nodeIdx uint32, pattern string, matcherType Type, value uint32) uint32 {
  50. node := &ac.nodes[nodeIdx]
  51. for i := len(pattern) - 1; i >= 0; i-- {
  52. edgeIdx := acCharset[pattern[i]]
  53. nextIdx := node.next[edgeIdx]
  54. if nextIdx == 0 { // Add new Trie Edge
  55. nextIdx = ac.addNode()
  56. ac.nodes[nodeIdx].next[edgeIdx] = nextIdx
  57. ac.nodes[nodeIdx].edge[edgeIdx] = acTrieEdge
  58. }
  59. nodeIdx = nextIdx
  60. node = &ac.nodes[nodeIdx]
  61. }
  62. if node.match == 0 { // Add new match entry
  63. node.match = ac.addMatchEntry()
  64. }
  65. ac.values[node.match][matcherType] = append(ac.values[node.match][matcherType], value)
  66. return nodeIdx
  67. }
  68. func (ac *ACAutomatonMatcherGroup) addNode() uint32 {
  69. ac.nodes = append(ac.nodes, acNode{})
  70. return uint32(len(ac.nodes) - 1)
  71. }
  72. func (ac *ACAutomatonMatcherGroup) addMatchEntry() uint32 {
  73. ac.values = append(ac.values, acValue{})
  74. return uint32(len(ac.values) - 1)
  75. }
  76. func (ac *ACAutomatonMatcherGroup) Build() error {
  77. fail := make([]uint32, len(ac.nodes))
  78. queue := list.New()
  79. for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
  80. if nextIdx := ac.nodes[0].next[edgeIdx]; nextIdx != 0 {
  81. queue.PushBack(nextIdx)
  82. }
  83. }
  84. for {
  85. front := queue.Front()
  86. if front == nil {
  87. break
  88. }
  89. queue.Remove(front)
  90. nodeIdx := front.Value.(uint32)
  91. node := &ac.nodes[nodeIdx] // Current node
  92. failNode := &ac.nodes[fail[nodeIdx]] // Fail node of currrent node
  93. for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
  94. nodeIdx := node.next[edgeIdx] // Next node through trie edge
  95. failIdx := failNode.next[edgeIdx] // Next node through fail edge
  96. if nodeIdx != 0 {
  97. queue.PushBack(nodeIdx)
  98. fail[nodeIdx] = failIdx
  99. if match := ac.nodes[failIdx].match; match != 0 && len(ac.values[match][Substr]) > 0 { // Fail node is a Substr match node
  100. ac.nodes[nodeIdx].fail = failIdx
  101. } else { // Use path compression to reduce fail path to only contain match nodes
  102. ac.nodes[nodeIdx].fail = ac.nodes[failIdx].fail
  103. }
  104. } else { // Add new fail edge
  105. node.next[edgeIdx] = failIdx
  106. node.edge[edgeIdx] = acFailEdge
  107. }
  108. }
  109. }
  110. return nil
  111. }
  112. // Match implements MatcherGroup.Match.
  113. func (ac *ACAutomatonMatcherGroup) Match(input string) []uint32 {
  114. var suffixMatches [][]uint32
  115. var substrMatches [][]uint32
  116. fullMatch := true // fullMatch indicates no fail edge traversed so far.
  117. node := &ac.nodes[0] // start from root node.
  118. // 1. the match string is all through trie edge. FULL MATCH or DOMAIN
  119. // 2. the match string is through a fail edge. NOT FULL MATCH
  120. // 2.1 Through a fail edge, but there exists a valid node. SUBSTR
  121. for i := len(input) - 1; i >= 0; i-- {
  122. edge := acCharset[input[i]]
  123. fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
  124. node = &ac.nodes[node.next[edge]] // Advance to next node
  125. // When entering a new node, traverse the fail path to find all possible Substr patterns:
  126. // 1. The fail path is compressed to only contains match nodes and root node (for terminate condition).
  127. // 2. node.fail != 0 is added here for better performance (as shown by benchmark), possibly it helps branch prediction.
  128. if node.fail != 0 {
  129. for failIdx, failNode := node.fail, &ac.nodes[node.fail]; failIdx != 0; failIdx, failNode = failNode.fail, &ac.nodes[failIdx] {
  130. substrMatches = append(substrMatches, ac.values[failNode.match][Substr])
  131. }
  132. }
  133. // When entering a new node, check whether this node is a match.
  134. // For Substr matchers:
  135. // 1. Matched in any situation, whether a failNode edge is traversed or not.
  136. // For Domain matchers:
  137. // 1. Should not traverse any fail edge (fullMatch).
  138. // 2. Only check on dot separator (input[i] == '.').
  139. if node.match != 0 {
  140. values := ac.values[node.match]
  141. if len(values[Substr]) > 0 {
  142. substrMatches = append(substrMatches, values[Substr])
  143. }
  144. if fullMatch && input[i] == '.' && len(values[Domain]) > 0 {
  145. suffixMatches = append(suffixMatches, values[Domain])
  146. }
  147. }
  148. }
  149. // At the end of input, check if the whole string matches a pattern.
  150. // For Domain matchers:
  151. // 1. Exact match on Domain Matcher works like Full Match. e.g. foo.com is a full match for domain:foo.com.
  152. // For Full matchers:
  153. // 1. Only when no fail edge is traversed (fullMatch).
  154. // 2. Takes the highest priority (added at last).
  155. if fullMatch && node.match != 0 {
  156. values := ac.values[node.match]
  157. if len(values[Domain]) > 0 {
  158. suffixMatches = append(suffixMatches, values[Domain])
  159. }
  160. if len(values[Full]) > 0 {
  161. suffixMatches = append(suffixMatches, values[Full])
  162. }
  163. }
  164. switch matches := append(substrMatches, suffixMatches...); len(matches) { // nolint: gocritic
  165. case 0:
  166. return nil
  167. case 1:
  168. return matches[0]
  169. default:
  170. result := []uint32{}
  171. for i := len(matches) - 1; i >= 0; i-- {
  172. result = append(result, matches[i]...)
  173. }
  174. return result
  175. }
  176. }
  177. // MatchAny implements MatcherGroup.MatchAny.
  178. func (ac *ACAutomatonMatcherGroup) MatchAny(input string) bool {
  179. fullMatch := true
  180. node := &ac.nodes[0]
  181. for i := len(input) - 1; i >= 0; i-- {
  182. edge := acCharset[input[i]]
  183. fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
  184. node = &ac.nodes[node.next[edge]]
  185. if node.fail != 0 { // There is a match on this node's fail path
  186. return true
  187. }
  188. if node.match != 0 { // There is a match on this node
  189. values := ac.values[node.match]
  190. if len(values[Substr]) > 0 { // Substr match succeeds unconditionally
  191. return true
  192. }
  193. if fullMatch && input[i] == '.' && len(values[Domain]) > 0 { // Domain match only succeeds with dot separator on trie path
  194. return true
  195. }
  196. }
  197. }
  198. return fullMatch && node.match != 0 // At the end of input, Domain and Full match will succeed if no fail edge is traversed
  199. }
  200. // Letter-Digit-Hyphen (LDH) subset (https://tools.ietf.org/html/rfc952):
  201. // - Letters A to Z (no distinction is made between uppercase and lowercase)
  202. // - Digits 0 to 9
  203. // - Hyphens(-) and Periods(.)
  204. //
  205. // If for future the strmatcher are used for other scenarios than domain,
  206. // we could add a new Charset interface to represent variable charsets.
  207. var acCharset = []int{
  208. 'A': 0,
  209. 'a': 0,
  210. 'B': 1,
  211. 'b': 1,
  212. 'C': 2,
  213. 'c': 2,
  214. 'D': 3,
  215. 'd': 3,
  216. 'E': 4,
  217. 'e': 4,
  218. 'F': 5,
  219. 'f': 5,
  220. 'G': 6,
  221. 'g': 6,
  222. 'H': 7,
  223. 'h': 7,
  224. 'I': 8,
  225. 'i': 8,
  226. 'J': 9,
  227. 'j': 9,
  228. 'K': 10,
  229. 'k': 10,
  230. 'L': 11,
  231. 'l': 11,
  232. 'M': 12,
  233. 'm': 12,
  234. 'N': 13,
  235. 'n': 13,
  236. 'O': 14,
  237. 'o': 14,
  238. 'P': 15,
  239. 'p': 15,
  240. 'Q': 16,
  241. 'q': 16,
  242. 'R': 17,
  243. 'r': 17,
  244. 'S': 18,
  245. 's': 18,
  246. 'T': 19,
  247. 't': 19,
  248. 'U': 20,
  249. 'u': 20,
  250. 'V': 21,
  251. 'v': 21,
  252. 'W': 22,
  253. 'w': 22,
  254. 'X': 23,
  255. 'x': 23,
  256. 'Y': 24,
  257. 'y': 24,
  258. 'Z': 25,
  259. 'z': 25,
  260. '-': 26,
  261. '.': 27,
  262. '0': 28,
  263. '1': 29,
  264. '2': 30,
  265. '3': 31,
  266. '4': 32,
  267. '5': 33,
  268. '6': 34,
  269. '7': 35,
  270. '8': 36,
  271. '9': 37,
  272. }