Skip to main content

Aho Corasick

Aho–Corasick algorithm is a string-searching algorithm invented by Alfred V. Aho and Margaret J. Corasick. It is a kind of dictionary-matching algorithm that locates elements of a finite set of strings within an input text. It matches all strings simultaneously.

var AhoCorasick = function(keywords)
{
  this._buildTables(keywords);
};

AhoCorasick.prototype._buildTables = function(keywords)
{
  var gotoFn =
  {
       0: {}
  };
  var output = {};
  var state = 0;

  keywords.forEach(function(word)
  {
    var curr = 0;
    for (var i=0; i<word.length; i++)
    {
      var l = word[i];
      if (gotoFn[curr] && l in gotoFn[curr])
      {
        curr = gotoFn[curr][l];
      }
      else
      {
        state++;
        gotoFn[curr][l] = state;
        gotoFn[state] = {};
        curr = state;
        output[state] = [];
      }
    }

    output[curr].push(word);
  });

  var failure = {};
  var xs = [];

  for (var l in gotoFn[0])
  {
    var state = gotoFn[0][l];
    failure[state] = 0;
    xs.push(state);
  }

  while (xs.length)
  {
      var r = xs.shift();
      for (var l in gotoFn[r])
      {
        var s = gotoFn[r][l];
        xs.push(s);
        var state = failure[r];
        while(state > 0 && !(l in gotoFn[state]))
        {
          state = failure[state];
        }

        if (l in gotoFn[state])
        {
          var fs = gotoFn[state][l];
          failure[s] = fs;
          output[s] = output[s].concat(output[fs]);
        }
        else
        {
          failure[s] = 0;
        }
      }
  }

  this.gotoFn = gotoFn;
  this.output = output;
  this.failure = failure;
};

AhoCorasick.prototype.search = function(string)
 {
   var state = 0;
   var results = [];
   for (var i=0; i<string.length; i++)
   {
     var l = string[i];
     while (state > 0 && !(l in this.gotoFn[state]))
     {
       state = this.failure[state];
     }
     if (!(l in this.gotoFn[state]))
     {
       continue;
     }

     state = this.gotoFn[state][l];

     if (this.output[state].length)
     {
       var foundStrs = this.output[state];
       results.push([i, foundStrs]);
     }
   }
   return results;
};