Browse Source

A memory-efficient and fast hybrid matcher (#639)

* a faster DomainMatcher implementation

* rename benchmark name

* fix linting errors

* add hybrid matcher

* add rabin-karp algorithm

* rename test & fix linting errors

* add more comment

* format code

* revert `MatcherGroup` match func

* fix linting errors
DarthVader 4 years ago
parent
commit
a31a8e6f89
2 changed files with 48 additions and 5 deletions
  1. 1 1
      app/router/condition_test.go
  2. 47 4
      common/strmatcher/strmatcher.go

+ 1 - 1
app/router/condition_test.go

@@ -399,7 +399,7 @@ func TestChinaSites(t *testing.T) {
 	}
 }
 
-func BenchmarkACDomainMatcher(b *testing.B) {
+func BenchmarkHybridDomainMatcher(b *testing.B) {
 	domains, err := loadGeoSite("CN")
 	common.Must(err)
 

+ 47 - 4
common/strmatcher/strmatcher.go

@@ -4,6 +4,9 @@ import (
 	"regexp"
 )
 
+// PrimeRK is the prime base used in Rabin-Karp algorithm.
+const PrimeRK = 16777619
+
 // Matcher is the interface to determine a string matches a pattern.
 type Matcher interface {
 	// Match returns true if the given string matches a predefined pattern.
@@ -61,20 +64,42 @@ type matcherEntry struct {
 type ACAutomatonMatcherGroup struct {
 	count         uint32
 	ac            *ACAutomaton
+	nonSubstrMap  map[uint32]string
 	otherMatchers []matcherEntry
 }
 
 func NewACAutomatonMatcherGroup() *ACAutomatonMatcherGroup {
 	var g = new(ACAutomatonMatcherGroup)
 	g.count = 1
-	g.ac = NewACAutomaton()
+	g.nonSubstrMap = map[uint32]string{}
 	return g
 }
 
+// Add `full` or `domain` pattern to hashmap
+func (g *ACAutomatonMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
+	h := uint32(0)
+	for i := len(pattern) - 1; i >= 0; i-- {
+		h = h*PrimeRK + uint32(pattern[i])
+	}
+	switch t {
+	case Full:
+		g.nonSubstrMap[h] = pattern
+	case Domain:
+		g.nonSubstrMap[h] = pattern
+		g.nonSubstrMap[h*PrimeRK+uint32('.')] = "." + pattern
+	default:
+	}
+}
+
 func (g *ACAutomatonMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
 	switch t {
-	case Full, Substr, Domain:
+	case Substr:
+		if g.ac == nil {
+			g.ac = NewACAutomaton()
+		}
 		g.ac.Add(pattern, t)
+	case Full, Domain:
+		g.AddFullOrDomainPattern(pattern, t)
 	case Regex:
 		g.count++
 		r, err := regexp.Compile(pattern)
@@ -92,18 +117,36 @@ func (g *ACAutomatonMatcherGroup) AddPattern(pattern string, t Type) (uint32, er
 }
 
 func (g *ACAutomatonMatcherGroup) Build() {
-	g.ac.Build()
+	if g.ac != nil {
+		g.ac.Build()
+	}
 }
 
 // Match implements IndexMatcher.Match.
 func (g *ACAutomatonMatcherGroup) Match(pattern string) []uint32 {
 	result := []uint32{}
-	if g.ac.Match(pattern) {
+	hash := uint32(0)
+	for i := len(pattern) - 1; i >= 0; i-- {
+		hash = hash*PrimeRK + uint32(pattern[i])
+		if pattern[i] == '.' {
+			if v, ok := g.nonSubstrMap[hash]; ok && v == pattern[i:] {
+				result = append(result, 1)
+				return result
+			}
+		}
+	}
+	if v, ok := g.nonSubstrMap[hash]; ok && v == pattern {
+		result = append(result, 1)
+		return result
+	}
+	if g.ac != nil && g.ac.Match(pattern) {
 		result = append(result, 1)
+		return result
 	}
 	for _, e := range g.otherMatchers {
 		if e.m.Match(pattern) {
 			result = append(result, e.id)
+			return result
 		}
 	}
 	return result