matchers.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. package strmatcher
  2. import (
  3. "errors"
  4. "regexp"
  5. "strings"
  6. "unicode/utf8"
  7. "golang.org/x/net/idna"
  8. )
  9. // FullMatcher is an implementation of Matcher.
  10. type FullMatcher string
  11. func (FullMatcher) Type() Type {
  12. return Full
  13. }
  14. func (m FullMatcher) Pattern() string {
  15. return string(m)
  16. }
  17. func (m FullMatcher) String() string {
  18. return "full:" + m.Pattern()
  19. }
  20. func (m FullMatcher) Match(s string) bool {
  21. return string(m) == s
  22. }
  23. // DomainMatcher is an implementation of Matcher.
  24. type DomainMatcher string
  25. func (DomainMatcher) Type() Type {
  26. return Domain
  27. }
  28. func (m DomainMatcher) Pattern() string {
  29. return string(m)
  30. }
  31. func (m DomainMatcher) String() string {
  32. return "domain:" + m.Pattern()
  33. }
  34. func (m DomainMatcher) Match(s string) bool {
  35. pattern := m.Pattern()
  36. if !strings.HasSuffix(s, pattern) {
  37. return false
  38. }
  39. return len(s) == len(pattern) || s[len(s)-len(pattern)-1] == '.'
  40. }
  41. // SubstrMatcher is an implementation of Matcher.
  42. type SubstrMatcher string
  43. func (SubstrMatcher) Type() Type {
  44. return Substr
  45. }
  46. func (m SubstrMatcher) Pattern() string {
  47. return string(m)
  48. }
  49. func (m SubstrMatcher) String() string {
  50. return "keyword:" + m.Pattern()
  51. }
  52. func (m SubstrMatcher) Match(s string) bool {
  53. return strings.Contains(s, m.Pattern())
  54. }
  55. // RegexMatcher is an implementation of Matcher.
  56. type RegexMatcher struct {
  57. pattern *regexp.Regexp
  58. }
  59. func (*RegexMatcher) Type() Type {
  60. return Regex
  61. }
  62. func (m *RegexMatcher) Pattern() string {
  63. return m.pattern.String()
  64. }
  65. func (m *RegexMatcher) String() string {
  66. return "regexp:" + m.Pattern()
  67. }
  68. func (m *RegexMatcher) Match(s string) bool {
  69. return m.pattern.MatchString(s)
  70. }
  71. // New creates a new Matcher based on the given pattern.
  72. func (t Type) New(pattern string) (Matcher, error) {
  73. switch t {
  74. case Full:
  75. return FullMatcher(pattern), nil
  76. case Substr:
  77. return SubstrMatcher(pattern), nil
  78. case Domain:
  79. pattern, err := ToDomain(pattern)
  80. if err != nil {
  81. return nil, err
  82. }
  83. return DomainMatcher(pattern), nil
  84. case Regex: // 1. regex matching is case-sensitive
  85. regex, err := regexp.Compile(pattern)
  86. if err != nil {
  87. return nil, err
  88. }
  89. return &RegexMatcher{pattern: regex}, nil
  90. default:
  91. return nil, errors.New("unknown matcher type")
  92. }
  93. }
  94. // NewDomainPattern creates a new Matcher based on the given domain pattern.
  95. // It works like `Type.New`, but will do validation and conversion to ensure it's a valid domain pattern.
  96. func (t Type) NewDomainPattern(pattern string) (Matcher, error) {
  97. switch t {
  98. case Full:
  99. pattern, err := ToDomain(pattern)
  100. if err != nil {
  101. return nil, err
  102. }
  103. return FullMatcher(pattern), nil
  104. case Substr:
  105. pattern, err := ToDomain(pattern)
  106. if err != nil {
  107. return nil, err
  108. }
  109. return SubstrMatcher(pattern), nil
  110. case Domain:
  111. pattern, err := ToDomain(pattern)
  112. if err != nil {
  113. return nil, err
  114. }
  115. return DomainMatcher(pattern), nil
  116. case Regex: // Regex's charset not in LDH subset
  117. regex, err := regexp.Compile(pattern)
  118. if err != nil {
  119. return nil, err
  120. }
  121. return &RegexMatcher{pattern: regex}, nil
  122. default:
  123. return nil, errors.New("unknown matcher type")
  124. }
  125. }
  126. // ToDomain converts input pattern to a domain string, and return error if such a conversion cannot be made.
  127. // 1. Conforms to Letter-Digit-Hyphen (LDH) subset (https://tools.ietf.org/html/rfc952):
  128. // * Letters A to Z (no distinction between uppercase and lowercase, we convert to lowers)
  129. // * Digits 0 to 9
  130. // * Hyphens(-) and Periods(.)
  131. // 2. If any non-ASCII characters, domain are converted from Internationalized domain name to Punycode.
  132. func ToDomain(pattern string) (string, error) {
  133. for {
  134. isASCII, hasUpper := true, false
  135. for i := 0; i < len(pattern); i++ {
  136. c := pattern[i]
  137. if c >= utf8.RuneSelf {
  138. isASCII = false
  139. break
  140. }
  141. switch {
  142. case 'A' <= c && c <= 'Z':
  143. hasUpper = true
  144. case 'a' <= c && c <= 'z':
  145. case '0' <= c && c <= '9':
  146. case c == '-':
  147. case c == '.':
  148. default:
  149. return "", errors.New("pattern string does not conform to Letter-Digit-Hyphen (LDH) subset")
  150. }
  151. }
  152. if !isASCII {
  153. var err error
  154. pattern, err = idna.Punycode.ToASCII(pattern)
  155. if err != nil {
  156. return "", err
  157. }
  158. continue
  159. }
  160. if hasUpper {
  161. pattern = strings.ToLower(pattern)
  162. }
  163. break
  164. }
  165. return pattern, nil
  166. }
  167. // MatcherGroupForAll is an interface indicating a MatcherGroup could accept all types of matchers.
  168. type MatcherGroupForAll interface {
  169. AddMatcher(matcher Matcher, value uint32)
  170. }
  171. // MatcherGroupForFull is an interface indicating a MatcherGroup could accept FullMatchers.
  172. type MatcherGroupForFull interface {
  173. AddFullMatcher(matcher FullMatcher, value uint32)
  174. }
  175. // MatcherGroupForDomain is an interface indicating a MatcherGroup could accept DomainMatchers.
  176. type MatcherGroupForDomain interface {
  177. AddDomainMatcher(matcher DomainMatcher, value uint32)
  178. }
  179. // MatcherGroupForSubstr is an interface indicating a MatcherGroup could accept SubstrMatchers.
  180. type MatcherGroupForSubstr interface {
  181. AddSubstrMatcher(matcher SubstrMatcher, value uint32)
  182. }
  183. // MatcherGroupForRegex is an interface indicating a MatcherGroup could accept RegexMatchers.
  184. type MatcherGroupForRegex interface {
  185. AddRegexMatcher(matcher *RegexMatcher, value uint32)
  186. }
  187. // AddMatcherToGroup is a helper function to try to add a Matcher to any kind of MatcherGroup.
  188. // It returns error if the MatcherGroup does not accept the provided Matcher's type.
  189. // This function is provided to help writing code to test a MatcherGroup.
  190. func AddMatcherToGroup(g MatcherGroup, matcher Matcher, value uint32) error {
  191. if g, ok := g.(IndexMatcher); ok {
  192. g.Add(matcher)
  193. return nil
  194. }
  195. if g, ok := g.(MatcherGroupForAll); ok {
  196. g.AddMatcher(matcher, value)
  197. return nil
  198. }
  199. switch matcher := matcher.(type) {
  200. case FullMatcher:
  201. if g, ok := g.(MatcherGroupForFull); ok {
  202. g.AddFullMatcher(matcher, value)
  203. return nil
  204. }
  205. case DomainMatcher:
  206. if g, ok := g.(MatcherGroupForDomain); ok {
  207. g.AddDomainMatcher(matcher, value)
  208. return nil
  209. }
  210. case SubstrMatcher:
  211. if g, ok := g.(MatcherGroupForSubstr); ok {
  212. g.AddSubstrMatcher(matcher, value)
  213. return nil
  214. }
  215. case *RegexMatcher:
  216. if g, ok := g.(MatcherGroupForRegex); ok {
  217. g.AddRegexMatcher(matcher, value)
  218. return nil
  219. }
  220. }
  221. return errors.New("cannot add matcher to matcher group")
  222. }
  223. // CompositeMatches flattens the matches slice to produce a single matched indices slice.
  224. // It is designed to avoid new memory allocation as possible.
  225. func CompositeMatches(matches [][]uint32) []uint32 {
  226. switch len(matches) {
  227. case 0:
  228. return nil
  229. case 1:
  230. return matches[0]
  231. default:
  232. result := make([]uint32, 0, 5)
  233. for i := 0; i < len(matches); i++ {
  234. result = append(result, matches[i]...)
  235. }
  236. return result
  237. }
  238. }
  239. // CompositeMatches flattens the matches slice to produce a single matched indices slice.
  240. // It is designed that:
  241. // 1. All matchers are concatenated in reverse order, so the matcher that matches further ranks higher.
  242. // 2. Indices in the same matcher keeps their original order.
  243. // 3. Avoid new memory allocation as possible.
  244. func CompositeMatchesReverse(matches [][]uint32) []uint32 {
  245. switch len(matches) {
  246. case 0:
  247. return nil
  248. case 1:
  249. return matches[0]
  250. default:
  251. result := make([]uint32, 0, 5)
  252. for i := len(matches) - 1; i >= 0; i-- {
  253. result = append(result, matches[i]...)
  254. }
  255. return result
  256. }
  257. }