matchergroup_ac_automation.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. package strmatcher
  2. import (
  3. "container/list"
  4. )
  5. const (
  6. acValidCharCount = 39 // aA-zZ (26), 0-9 (10), - (1), . (1), invalid(1)
  7. acMatchTypeCount = 3 // Full, Domain and Substr
  8. )
  9. type acEdge byte
  10. const (
  11. acTrieEdge acEdge = 1
  12. acFailEdge acEdge = 0
  13. )
  14. type acNode struct {
  15. next [acValidCharCount]uint32 // EdgeIdx -> Next NodeIdx (Next trie node or fail node)
  16. edge [acValidCharCount]acEdge // EdgeIdx -> Trie Edge / Fail Edge
  17. fail uint32 // NodeIdx of *next matched* Substr Pattern on its fail path
  18. match uint32 // MatchIdx of matchers registered on this node, 0 indicates no match
  19. } // Sizeof acNode: (4+1)*acValidCharCount + <padding> + 4 + 4
  20. type acValue [acMatchTypeCount][]uint32 // MatcherType -> Registered Matcher Values
  21. // ACAutoMationMatcherGroup is an implementation of MatcherGroup.
  22. // It uses an AC Automata to provide support for Full, Domain and Substr matcher. Trie node is char based.
  23. //
  24. // NOTICE: ACAutomatonMatcherGroup currently uses a restricted charset (LDH Subset),
  25. // upstream should manually in a way to ensure all patterns and inputs passed to it to be in this charset.
  26. type ACAutomatonMatcherGroup struct {
  27. nodes []acNode // NodeIdx -> acNode
  28. values []acValue // MatchIdx -> acValue
  29. }
  30. func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup {
  31. ac := new(ACAutomatonMatcherGroup)
  32. ac.addNode() // Create root node (NodeIdx 0)
  33. ac.addMatchEntry() // Create sentinel match entry (MatchIdx 0)
  34. return ac
  35. }
  36. // AddFullMatcher implements MatcherGroupForFull.AddFullMatcher.
  37. func (ac *ACAutomatonMatcherGroup) AddFullMatcher(matcher FullMatcher, value uint32) {
  38. ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
  39. }
  40. // AddDomainMatcher implements MatcherGroupForDomain.AddDomainMatcher.
  41. func (ac *ACAutomatonMatcherGroup) AddDomainMatcher(matcher DomainMatcher, value uint32) {
  42. node := ac.addPattern(0, matcher.Pattern(), matcher.Type(), value) // For full domain match
  43. ac.addPattern(node, ".", matcher.Type(), value) // For partial domain match
  44. }
  45. // AddSubstrMatcher implements MatcherGroupForSubstr.AddSubstrMatcher.
  46. func (ac *ACAutomatonMatcherGroup) AddSubstrMatcher(matcher SubstrMatcher, value uint32) {
  47. ac.addPattern(0, matcher.Pattern(), matcher.Type(), value)
  48. }
  49. func (ac *ACAutomatonMatcherGroup) addPattern(nodeIdx uint32, pattern string, matcherType Type, value uint32) uint32 {
  50. node := &ac.nodes[nodeIdx]
  51. for i := len(pattern) - 1; i >= 0; i-- {
  52. edgeIdx := acCharset[pattern[i]]
  53. nextIdx := node.next[edgeIdx]
  54. if nextIdx == 0 { // Add new Trie Edge
  55. nextIdx = ac.addNode()
  56. ac.nodes[nodeIdx].next[edgeIdx] = nextIdx
  57. ac.nodes[nodeIdx].edge[edgeIdx] = acTrieEdge
  58. }
  59. nodeIdx = nextIdx
  60. node = &ac.nodes[nodeIdx]
  61. }
  62. if node.match == 0 { // Add new match entry
  63. node.match = ac.addMatchEntry()
  64. }
  65. ac.values[node.match][matcherType] = append(ac.values[node.match][matcherType], value)
  66. return nodeIdx
  67. }
  68. func (ac *ACAutomatonMatcherGroup) addNode() uint32 {
  69. ac.nodes = append(ac.nodes, acNode{})
  70. return uint32(len(ac.nodes) - 1)
  71. }
  72. func (ac *ACAutomatonMatcherGroup) addMatchEntry() uint32 {
  73. ac.values = append(ac.values, acValue{})
  74. return uint32(len(ac.values) - 1)
  75. }
  76. func (ac *ACAutomatonMatcherGroup) Build() error {
  77. fail := make([]uint32, len(ac.nodes))
  78. queue := list.New()
  79. for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
  80. if nextIdx := ac.nodes[0].next[edgeIdx]; nextIdx != 0 {
  81. queue.PushBack(nextIdx)
  82. }
  83. }
  84. for {
  85. front := queue.Front()
  86. if front == nil {
  87. break
  88. }
  89. queue.Remove(front)
  90. nodeIdx := front.Value.(uint32)
  91. node := &ac.nodes[nodeIdx] // Current node
  92. failNode := &ac.nodes[fail[nodeIdx]] // Fail node of currrent node
  93. for edgeIdx := 0; edgeIdx < acValidCharCount; edgeIdx++ {
  94. nodeIdx := node.next[edgeIdx] // Next node through trie edge
  95. failIdx := failNode.next[edgeIdx] // Next node through fail edge
  96. if nodeIdx != 0 {
  97. queue.PushBack(nodeIdx)
  98. fail[nodeIdx] = failIdx
  99. if match := ac.nodes[failIdx].match; match != 0 && len(ac.values[match][Substr]) > 0 { // Fail node is a Substr match node
  100. ac.nodes[nodeIdx].fail = failIdx
  101. } else { // Use path compression to reduce fail path to only contain match nodes
  102. ac.nodes[nodeIdx].fail = ac.nodes[failIdx].fail
  103. }
  104. } else { // Add new fail edge
  105. node.next[edgeIdx] = failIdx
  106. node.edge[edgeIdx] = acFailEdge
  107. }
  108. }
  109. }
  110. return nil
  111. }
  112. // Match implements MatcherGroup.Match.
  113. func (ac *ACAutomatonMatcherGroup) Match(input string) []uint32 {
  114. suffixMatches := make([][]uint32, 0, 5)
  115. substrMatches := make([][]uint32, 0, 5)
  116. fullMatch := true // fullMatch indicates no fail edge traversed so far.
  117. node := &ac.nodes[0] // start from root node.
  118. // 1. the match string is all through trie edge. FULL MATCH or DOMAIN
  119. // 2. the match string is through a fail edge. NOT FULL MATCH
  120. // 2.1 Through a fail edge, but there exists a valid node. SUBSTR
  121. for i := len(input) - 1; i >= 0; i-- {
  122. edge := acCharset[input[i]]
  123. fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
  124. node = &ac.nodes[node.next[edge]] // Advance to next node
  125. // When entering a new node, traverse the fail path to find all possible Substr patterns:
  126. // 1. The fail path is compressed to only contains match nodes and root node (for terminate condition).
  127. // 2. node.fail != 0 is added here for better performance (as shown by benchmark), possibly it helps branch prediction.
  128. if node.fail != 0 {
  129. for failIdx, failNode := node.fail, &ac.nodes[node.fail]; failIdx != 0; failIdx, failNode = failNode.fail, &ac.nodes[failIdx] {
  130. substrMatches = append(substrMatches, ac.values[failNode.match][Substr])
  131. }
  132. }
  133. // When entering a new node, check whether this node is a match.
  134. // For Substr matchers:
  135. // 1. Matched in any situation, whether a failNode edge is traversed or not.
  136. // For Domain matchers:
  137. // 1. Should not traverse any fail edge (fullMatch).
  138. // 2. Only check on dot separator (input[i] == '.').
  139. if node.match != 0 {
  140. values := ac.values[node.match]
  141. if len(values[Substr]) > 0 {
  142. substrMatches = append(substrMatches, values[Substr])
  143. }
  144. if fullMatch && input[i] == '.' && len(values[Domain]) > 0 {
  145. suffixMatches = append(suffixMatches, values[Domain])
  146. }
  147. }
  148. }
  149. // At the end of input, check if the whole string matches a pattern.
  150. // For Domain matchers:
  151. // 1. Exact match on Domain Matcher works like Full Match. e.g. foo.com is a full match for domain:foo.com.
  152. // For Full matchers:
  153. // 1. Only when no fail edge is traversed (fullMatch).
  154. // 2. Takes the highest priority (added at last).
  155. if fullMatch && node.match != 0 {
  156. values := ac.values[node.match]
  157. if len(values[Domain]) > 0 {
  158. suffixMatches = append(suffixMatches, values[Domain])
  159. }
  160. if len(values[Full]) > 0 {
  161. suffixMatches = append(suffixMatches, values[Full])
  162. }
  163. }
  164. if len(substrMatches) == 0 {
  165. return CompositeMatchesReverse(suffixMatches)
  166. }
  167. return CompositeMatchesReverse(append(substrMatches, suffixMatches...))
  168. }
  169. // MatchAny implements MatcherGroup.MatchAny.
  170. func (ac *ACAutomatonMatcherGroup) MatchAny(input string) bool {
  171. fullMatch := true
  172. node := &ac.nodes[0]
  173. for i := len(input) - 1; i >= 0; i-- {
  174. edge := acCharset[input[i]]
  175. fullMatch = fullMatch && (node.edge[edge] == acTrieEdge)
  176. node = &ac.nodes[node.next[edge]]
  177. if node.fail != 0 { // There is a match on this node's fail path
  178. return true
  179. }
  180. if node.match != 0 { // There is a match on this node
  181. values := ac.values[node.match]
  182. if len(values[Substr]) > 0 { // Substr match succeeds unconditionally
  183. return true
  184. }
  185. if fullMatch && input[i] == '.' && len(values[Domain]) > 0 { // Domain match only succeeds with dot separator on trie path
  186. return true
  187. }
  188. }
  189. }
  190. return fullMatch && node.match != 0 // At the end of input, Domain and Full match will succeed if no fail edge is traversed
  191. }
  192. // Letter-Digit-Hyphen (LDH) subset (https://tools.ietf.org/html/rfc952):
  193. // - Letters A to Z (no distinction is made between uppercase and lowercase)
  194. // - Digits 0 to 9
  195. // - Hyphens(-) and Periods(.)
  196. //
  197. // If for future the strmatcher are used for other scenarios than domain,
  198. // we could add a new Charset interface to represent variable charsets.
  199. var acCharset = [256]int{
  200. 'A': 1,
  201. 'a': 1,
  202. 'B': 2,
  203. 'b': 2,
  204. 'C': 3,
  205. 'c': 3,
  206. 'D': 4,
  207. 'd': 4,
  208. 'E': 5,
  209. 'e': 5,
  210. 'F': 6,
  211. 'f': 6,
  212. 'G': 7,
  213. 'g': 7,
  214. 'H': 8,
  215. 'h': 8,
  216. 'I': 9,
  217. 'i': 9,
  218. 'J': 10,
  219. 'j': 10,
  220. 'K': 11,
  221. 'k': 11,
  222. 'L': 12,
  223. 'l': 12,
  224. 'M': 13,
  225. 'm': 13,
  226. 'N': 14,
  227. 'n': 14,
  228. 'O': 15,
  229. 'o': 15,
  230. 'P': 16,
  231. 'p': 16,
  232. 'Q': 17,
  233. 'q': 17,
  234. 'R': 18,
  235. 'r': 18,
  236. 'S': 19,
  237. 's': 19,
  238. 'T': 20,
  239. 't': 20,
  240. 'U': 21,
  241. 'u': 21,
  242. 'V': 22,
  243. 'v': 22,
  244. 'W': 23,
  245. 'w': 23,
  246. 'X': 24,
  247. 'x': 24,
  248. 'Y': 25,
  249. 'y': 25,
  250. 'Z': 26,
  251. 'z': 26,
  252. '-': 27,
  253. '.': 28,
  254. '0': 29,
  255. '1': 30,
  256. '2': 31,
  257. '3': 32,
  258. '4': 33,
  259. '5': 34,
  260. '6': 35,
  261. '7': 36,
  262. '8': 37,
  263. '9': 38,
  264. }