• Jump To … +
    dawg.coffee max-heap.coffee index.coffee builder.coffee distance.coffee transducer.coffee mutate.coffee operations.coffee permutations.coffee truth-table.coffee
  • builder.coffee

  • ¶
    global =
      if typeof exports is 'object'
        exports
      else if typeof window is 'object'
        window
      else
        this
    
    global['levenshtein'] ||= {}
    
    if typeof require is 'function'
      {levenshtein: {MaxHeap}} = require '../collection/max-heap'
      {levenshtein: {Transducer}} = require './transducer'
      {levenshtein: {Dawg}} = require '../collection/dawg'
    else
      MaxHeap = global['levenshtein']['MaxHeap']
      Transducer = global['levenshtein']['Transducer']
      Dawg = global['levenshtein']['Dawg']
    
    fields =
  • ¶

    Dictionary of terms

      '_dictionary': new Dawg([])
  • ¶

    Search algorithm to use

      '_algorithm': 'standard'
  • ¶

    Sort the candidates as they are discovered

      '_sort_candidates': true
  • ¶

    If sort_candidates, then sort them in a case-insensitive fashion

      '_case_insensitive_sort': true
  • ¶

    Include the distance from the query term for each spelling candidate

      '_include_distance': true
  • ¶

    Maximum number of spelling candidates to return

      '_maximum_candidates': Infinity
  • ¶

    Customer comparator for the max-heap (optional). This should be an arity-2 function that accepts two pairs of [“term”, distance] values.

      '_custom_comparator': null
  • ¶

    Custom transform for spelling candidates (optional). This should be an arity-1 function that accepts a pair of [“term”, distance] values.

      '_custom_transform': null
  • ¶

    Maximum number of spelling errors that are tollerated. This can be overridden with the second parameter to Transducer.transduce(term, n)

      '_default_edit_distance': Infinity
    
    class Builder
      constructor: (source, attributes) ->
        if source instanceof Builder
          for own field of fields
            this[field] = source[field]
          for own attribute, value of attributes
            this['_' + attribute] = value
  • ¶

    The distance of each position in a state can be defined as follows:

    distance = w - i + e

    For every accepting position, it must be the case that w - i <= n - e. It follows directly that the distance of every accepted position must be no more than n:

    (w - i <= n - e) <=> (w - i + e <= n) <=> (distance <= n)

    The Levenshtein distance between any two terms is defined as the minimum edit distance between the two terms. Therefore, iterate over each position in an accepting state, and take the minimum distance among all its accepting positions as the corresponding Levenshtein distance.

      _minimum_distance: () ->
        if @['_algorithm'] is 'standard'
          (state, w) ->
            minimum = Infinity
            for [i,e] in state
              distance = w - i + e
              minimum = distance if distance < minimum
            minimum
        else
          (state, w) ->
            minimum = Infinity
            for [i,e,x] in state
              distance = w - i + e
              minimum = distance if x isnt 1 and distance < minimum
            minimum
    
      _comparator: () ->
        if typeof @['_custom_comparator'] is 'function'
          @['_custom_comparator']
        else if @['_sort_candidates']
  • ¶

    Sort by minimum distance from the query term.

          comparator = (a,b) -> a[1] - b[1]
  • ¶

    Sort in a case-insensitive manner.

          comparator = do (comparator) ->
            (a,b) ->
              comparator(a,b) || a[0].toLowerCase().localeCompare(b[0].toLowerCase())
  • ¶

    If the terms are the same, case-insensitive, then compare them in a case-sensitive manner.

          unless @['_case_insensitive_sort']
            comparator = do (comparator) ->
              (a,b) ->
                comparator(a,b) || a[0].localeCompare(b[0])
          comparator
        else
          () -> 0 #-> If we don't want to sort the matches, make all terms equal
    
      _transform: (comparator) ->
        transform =
          if typeof @['_custom_transform'] is 'function'
            @['_custom_transform']
          else if @['_include_distance'] is false
            (candidate) -> candidate[0]
    
        (matches) =>
          if isFinite @['_maximum_candidates']
            matches['sort']() #-> sorts in reverse
            matches = matches['heap']
          else if @['_sort_candidates']
            heap = matches
            matches = []
            matches.push heap['pop']() while heap['peek']() isnt null
          if typeof transform is 'function'
            i = -1; while (++i) < matches.length
              matches[i] = transform(matches[i])
          matches
    
      _initial_state: () ->
        if @['_algorithm'] is 'standard'
          [[0,0]]
        else
          [[0,0,0]]
  • ¶

    Accepts a state vector and sorts its elements in ascending order.

      _sort_for_transition: () ->
        comparator = (a,b) -> a[0] - b[0] || a[1] - b[1]
        if @['_algorithm'] in ['transposition', 'merge_and_split']
          comparator = do (comparator) ->
            (a,b) -> comparator(a,b) || a[2] - b[2]
        (state) -> state.sort(comparator)
    
      _index_of: (vector, k, i) ->
        j = 0
        while j < k
          return j if vector[i + j]
          j += 1
        return -1
  • ¶

    Accepts a maximum edit distance and returns a transition function that maps a position, state vector and table offset of the current state to its next state.

      _transition_for_position: () ->
        switch @['_algorithm']
          when 'standard' then (n) =>
            ([i,e], vector, offset) =>
              h = i - offset; w = vector.length
              if e < n
                if h <= w - 2
                  a = n - e + 1; b = w - h
                  k = if a < b then a else b
                  j = @_index_of(vector, k, h)
                  if j == 0
                    [
                      [(i + 1), e]
                    ]
                  else if j > 0
                    [
                      [i, (e + 1)]
                      [(i + 1), (e + 1)]
                      [(i + j + 1), (e + j)]
                    ]
                  else
                    [
                      [i, (e + 1)]
                      [(i + 1), (e + 1)]
                    ]
                else if h == w - 1
                  if vector[h]
                    [
                      [(i + 1), e]
                    ]
                  else
                    [
                      [i, (e + 1)]
                      [(i + 1), (e + 1)]
                    ]
                else # h == w
                  [
                    [i, (e + 1)]
                  ]
              else if e == n
                if h <= w - 1
                  if vector[h]
                    [
                      [(i + 1), n]
                    ]
                  else
                    null
                else
                  null
              else
                null
    
          when 'transposition' then (n) =>
            ([i,e,t], vector, offset) =>
              h = i - offset; w = vector.length
              if e == 0 < n
                if h <= w - 2
                  a = n - e + 1; b = w - h
                  k = if a < b then a else b
                  j = @_index_of(vector, k, h)
                  if j == 0
                    [
                      [(i + 1), 0, 0]
                    ]
                  else if j == 1
                    [
                      [i, 1, 0]
                      [i, 1, 1] # t-position
                      [(i + 1), 1, 0]
                      [(i + 2), 1, 0] # was [(i + j + 1), j, 0], but j=1
                    ]
                  else if j > 1
                    [
                      [i, 1, 0]
                      [(i + 1), 1, 0]
                      [(i + j + 1), j, 0]
                    ]
                  else
                    [
                      [i, 1, 0]
                      [(i + 1), 1, 0]
                    ]
                else if h == w - 1
                  if vector[h]
                    [
                      [(i + 1), 0, 0]
                    ]
                  else
                    [
                      [i, 1, 0]
                      [(i + 1), 1, 0]
                    ]
                else # h == w
                  [
                    [i, 1, 0]
                  ]
              else if 1 <= e < n
                if h <= w - 2
                  if t is 0 # [i,e] is not a t-position
                    a = n - e + 1; b = w - h
                    k = if a < b then a else b
                    j = @_index_of(vector, k, h)
                    if j == 0
                      [
                        [(i + 1), e, 0]
                      ]
                    else if j == 1
                      [
                        [i, (e + 1), 0]
                        [i, (e + 1), 1] # t-position
                        [(i + 1), (e + 1), 0]
                        [(i + 2), (e + 1), 0] # was [(i + j + 1), (e + j), 0], but j=1
                      ]
                    else if j > 1
                      [
                        [i, (e + 1), 0]
                        [(i + 1), (e + 1), 0]
                        [(i + j + 1), (e + j), 0]
                      ]
                    else
                      [
                        [i, (e + 1), 0]
                        [(i + 1), (e + 1), 0]
                      ]
                  else
                    if vector[h]
                      [
                        [(i + 2), e, 0]
                      ]
                    else
                      null
                else if h == w - 1
                  if vector[h]
                    [
                      [(i + 1), e, 0]
                    ]
                  else
                    [
                      [i, (e + 1), 0]
                      [(i + 1), (e + 1), 0]
                    ]
                else # h == w
                  [
                    [i, (e + 1), 0]
                  ]
              else
                if h <= w - 1 and t is 0
                  if vector[h]
                    [
                      [(i + 1), n, 0]
                    ]
                  else
                    null
                else if h <= w - 2 and t is 1 # [i,e] is a t-position
                  if vector[h]
                    [
                      [(i + 2), n, 0]
                    ]
                  else
                    null
                else # h == w
                  null
    
          when 'merge_and_split' then (n) =>
            ([i,e,s], vector, offset) =>
              h = i - offset; w = vector.length
              if e == 0 < n
                if h <= w - 2
                  if vector[h]
                    [
                      [(i + 1), e, 0]
                    ]
                  else
                    [
                      [i, (e + 1), 0]
                      [i, (e + 1), 1] # s-position
                      [(i + 1), (e + 1), 0]
                      [(i + 2), (e + 1), 0]
                    ]
                else if h == w - 1
                  if vector[h]
                    [
                      [(i + 1), e, 0]
                    ]
                  else
                    [
                      [i, (e + 1), 0]
                      [i, (e + 1), 1] # s-position
                      [(i + 1), (e + 1), 0]
                    ]
                else # h == w
                  [
                    [i, (e + 1), 0]
                  ]
              else if e < n
                if h <= w - 2
                  if s is 0
                    if vector[h]
                      [
                        [(i + 1), e, 0]
                      ]
                    else
                      [
                        [i, (e + 1), 0]
                        [i, (e + 1), 1] # s-position
                        [(i + 1), (e + 1), 0]
                        [(i + 2), (e + 1), 0]
                      ]
                  else # [i,e] is an s-position
                    [
                      [(i + 1), e, 0]
                    ]
                else if h == w - 1
                  if s is 0
                    if vector[h]
                      [
                        [(i + 1), e, 0]
                      ]
                    else
                      [
                        [i, (e + 1), 0]
                        [i, (e + 1), 1] # s-position
                        [(i + 1), (e + 1), 0]
                      ]
                  else # [i,e] is an s-position
                    [
                      [(i + 1), e, 0]
                    ]
                else # h == w
                  [
                    [i, (e + 1), 0]
                  ]
              else
                if h <= w - 1
                  if s is 0
                    if vector[h]
                      [
                        [(i + 1), n, 0]
                      ]
                    else
                      null
                  else # [i,e] is an s-position
                    [
                      [(i + 1), e, 0]
                    ]
                else # h == w
                  null
  • ¶

    Given two positions [i,e] and [j,f], for [i,e] to subsume [j,f], it must be the case that e < f. Therefore, I can remove a redundant check for (e < f) within the subsumes method by finding the first index that contains a position having an error greater than the current one (assuming that the positions are sorted in ascending order, according to error).

      _bisect_error_right: (state, e, l) ->
        u = state.length
        while l < u
          i = (l + u) >> 1
          if e < state[i][1]
            u = i
          else
            l = i + 1
        return l
  • ¶

    Removes all subsumed positions from a state

      _unsubsume: () =>
        subsumes = @_subsumes()
        bisect_error_right = @_bisect_error_right
        switch @['_algorithm']
          when 'standard'
            (state) ->
              m = 0
              while x = state[m]
                [i,e] = x; n = bisect_error_right(state, e, m)
                while y = state[n]
                  [j,f] = y
                  if subsumes(i,e, j,f)
                    state.splice(n,1)
                  else
                    n += 1
                m += 1
              return
          when 'transposition'
            (state) ->
              m = 0
              while x = state[m]
                [i,e,s] = x; n = bisect_error_right(state, e, m)
                while y = state[n]
                  [j,f,t] = y
                  if subsumes(i,e,s, j,f,t, n)
                    state.splice(n,1)
                  else
                    n += 1
                m += 1
              return
          when 'merge_and_split'
            (state) ->
              m = 0
              while x = state[m]
                [i,e,s] = x; n = bisect_error_right(state, e, m)
                while y = state[n]
                  [j,f,t] = y
                  if subsumes(i,e,s, j,f,t, n)
                    state.splice(n,1)
                  else
                    n += 1
                m += 1
              return
  • ¶

    NOTE: See my comment above bisect_error_right(state,e,l) and how I am using it in _unsubsume for why I am not checking (e < f) below.

      _subsumes: () ->
        switch @['_algorithm']
          when 'standard' then (i,e, j,f) ->
  • ¶

    (e < f) && Math.abs(j - i) <= (f - e)

            ((i < j) && (j - i) || (i - j)) <= (f - e)
          when 'transposition' then (i,e,s, j,f,t, n) ->
            if s is 1
              if t is 1
  • ¶

    (e < f) && (i == j)

                (i == j)
              else
  • ¶

    (e < f == n) && (i == j)

                (f == n) && (i == j)
            else
              if t is 1
  • ¶

    We have two cases:

    Case 1: (j < i) => (j - i) = - (i - j) => |j - (i - 1)| = |j - i + 1| = |-(i - j) + 1| = |-(i - j - 1)| = i - j - 1

    Case 1 holds, because i and j are integers, and j < i implies i is at least 1 unit greater than j, further implying that i - j - 1 is non-negative.

    Case 2: (j >= i) => |j - (i - 1)| = |j - i + 1| = j - i + 1

    Case 2 holds for the same reason case 1 does, in that j - i >= 0, and adding 1 to the difference will only strengthen its non-negativity.

    Math.abs(j - (i - 1)) <= (f - e);

                (if (j < i) then (i - j - 1) else (j - i + 1)) <= (f - e)
              else
  • ¶

    (e < f) && Math.abs(j - i) <= (f - e)

                ((i < j) && (j - i) || (i - j)) <= (f - e)
          when 'merge_and_split' then(i,e,s, j,f,t) ->
            if s is 1 and t is 0
              false
            else
  • ¶

    (e < f) && Math.abs(j - i) <= (f - e)

              ((i < j) && (j - i) || (i - j)) <= (f - e)
    
      _bisect_left: () ->
        if @['_algorithm']
          (state, position) ->
            [i,e] = position; l = 0; u = state.length
            while l < u
              k = (l + u) >> 1
              p = state[k]
              if (e - p[1] || i - p[0]) > 0
                l = k + 1
              else
                u = k
            return l
        else
          (state, position) ->
            [i,e,x] = position; l = 0; u = state.length
            while l < u
              k = (l + u) >> 1
              p = state[k]
              if (e - p[1] || i - p[0] || x - p[2]) > 0
                l = k + 1
              else
                u = k
            return l
  • ¶

    Merges the positions of next_state into state_prime, in a subsumption-friendly manner.

      _merge_for_subsumption: () ->
        bisect_left = @_bisect_left()
        if @['_algorithm'] is 'standard'
          (state_prime, next_state) ->
  • ¶

    Order according to error first, then boundary (both ascending). While sorting the elements, remove any duplicates.

            for position in next_state
              i = bisect_left(state_prime, position)
              if curr = state_prime[i]
                if curr[0] != position[0] || curr[1] != position[1]
                  state_prime.splice(i, 0, position)
              else
                state_prime.push(position)
            return
        else
          (state_prime, next_state) ->
  • ¶

    Order according to error first, then boundary (both ascending). While sorting the elements, remove any duplicates.

            for position in next_state
              i = bisect_left(state_prime, position)
              if curr = state_prime[i]
                if curr[0] != position[0] || curr[1] != position[1] || curr[2] != position[2]
                  state_prime.splice(i, 0, position)
              else
                state_prime.push(position)
            return
    
      _transition_for_state: () ->
        merge_for_subsumption = @_merge_for_subsumption()
        unsubsume = @_unsubsume()
        transition_for_position = @_transition_for_position()
        sort_for_transition = @_sort_for_transition()
        (n) ->
          transition = transition_for_position(n)
          (state, vector) =>
            offset = state[0][0]; state_prime = []
    
            for position in state
              next_state = transition(position, vector, offset)
              continue unless next_state
              merge_for_subsumption(state_prime, next_state)
            unsubsume(state_prime)
    
            if state_prime.length > 0
              sort_for_transition(state_prime)
              state_prime
            else
              null
    
      _characteristic_vector: () ->
        (x, term, k, i) ->
          vector = []; j = 0
          while j < k
            vector.push(x is term[i + j])
            j += 1
          vector
    
      _push: (compare) ->
        maximum_candidates = @['_maximum_candidates']
        if isFinite maximum_candidates
          (candidates, candidate) ->
            if candidates.length is maximum_candidates
  • ¶

    We are maintaining a max-heap so that the element furthest from the query term will be on the top. If the new candidate is closer to the query term then it should replace the old one.

              if compare(candidate, candidates['peek']()) < 0
                candidates['pop']()
                candidates.push(candidate)
            else
              candidates.push(candidate)
            candidates
        else
          (candidates, candidate) ->
            candidates.push(candidate)
            candidates
    
      'build': () ->
        comparator = @_comparator()
        new Transducer({
          'minimum_distance': @_minimum_distance()
          'build_matches': do =>
            if isFinite @['_maximum_candidates']
              () -> new MaxHeap(comparator)
            else if @['_sort_candidates']
              () -> new MaxHeap (a,b) -> - comparator(a,b)
            else
              () -> []
          'transition_for_state': @_transition_for_state()
          'characteristic_vector': @_characteristic_vector()
          'edges': (dawg_node) -> dawg_node['edges']
          'is_final': (dawg_node) -> dawg_node['is_final']
          'root': do (dawg = @['_dictionary']) ->
            () -> dawg['root']
          'initial_state': do (initial_state=@_initial_state()) ->
            () => initial_state
          'push': @_push(comparator)
          'default_edit_distance': () => @['default_edit_distance']()
          'transform': @_transform(comparator)
        })
  • ¶

    Aliases Builder::transducer to Builder::build, for those who prefer the syntax, builder.transducer(), over builder.build()

    Builder::['transducer'] = Builder::['build']
  • ¶

    Initialize the default, property values

    for own property, value of fields
      Builder::[property] = value
  • ¶

    Performs no operation

    noop = () -> return
  • ¶

    Identity function: returns whatever you give it

    identity = (x) -> x
    
    def_property = def_properties = (properties, params; property, i) ->
      [validate, translate] = [params['validate'], params['translate']]
      if typeof properties is 'string'
        properties = [properties]
      unless properties instanceof Array
        throw new Error('Expected "properties" to be of type Array')
      if validate isnt `undefined` and typeof validate isnt 'function'
        throw new Error('Expected "validate" to be of type Function')
      if translate isnt `undefined` and typeof translate isnt 'function'
        throw new Error('Expected "translate" to be of type Function')
    
      validate ||= noop
      translate ||= identity
    
      for property, i in properties
        if typeof property isnt 'string'
          throw new Error(
            "Expected property at index #{i} of properties to be of type String")
        do (property) ->
          field = '_' + property
          Builder::[property] =
            (value, opts...) ->
              if value is `undefined`
                @[field]
              else
                validate(value, opts, property)
                value = translate(value, opts, property)
                attributes = {}
                attributes[property] = value
                new Builder(this, attributes)
      true
    
    def_property 'dictionary',
      'validate': (dictionary) ->
        unless dictionary instanceof Array or dictionary instanceof Dawg
          throw new Error('dictionary must be either an Array or Dawg')
      'translate': (dictionary, [sorted]) ->
        if dictionary instanceof Array
          dictionary.sort() unless sorted is true
          dictionary = new Dawg(dictionary)
        dictionary
    
    def_property 'algorithm',
      'validate': (algorithm) ->
        unless algorithm in ['standard', 'transposition', 'merge_and_split']
          throw new Error(
            'algorithm must be standard, transposition, or merge_and_split')
    
    def_properties ['sort_candidates', 'case_insensitive_sort', 'include_distance'],
      'validate': (value, _, property) ->
        unless typeof value is 'boolean'
          throw new Error("Expected type of \"#{property}\" to be boolean")
    
    def_properties ['maximum_candidates', 'default_edit_distance'],
      'validate': (value, _, property) ->
        unless typeof value is 'number' and 0 <= value
          throw new Error("Expected \"#{property}\" to be a non-negative number")
    
    def_properties ['custom_comparator', 'custom_transform'],
      'validate': (value, _, property) ->
        unless typeof value is 'function'
          throw new Error("Expected \"#{property}\" to be a function")
    
    global['levenshtein']['Builder'] = Builder