Spell Checker

A solution to Bloom Filters kata described at CodeKata.

Solution

""" spell_checker.py  - an experiment with Bloom filters """

# usage: python3 spell_checker.py < text > in 2> out ; wc -l in out

import hashlib
import string
import sys


class WordBitArray():
    """ bitmap of words """
    def __init__(self, filename, size=5, cycles=3):
        self.size = size
        self.cycles = cycles
        self.array = 0
        with open(filename, 'r', encoding='utf-8') as text_file:
            dictionary = text_file.read()
        for entry in dictionary.split():
            for bit in self.bits(entry):
                self.array |= 1 << bit

    def bits(self, text):
        """ which bits """
        msg = hashlib.md5()
        msg.update(text.encode())
        return [int(msg.hexdigest()[i * self.size:(i + 1) * self.size],
                    base=16) for i in range(self.cycles)]

    def check(self, text):
        """ are text bits in array """
        alpha = ''.join([i for i in text if i in string.ascii_letters])
        key = 0
        for bit in self.bits(alpha):
            key |= 1 << bit
        if key == key & self.array:
            return True
        if alpha == alpha.lower():
            return False
        return self.check(text.lower())


if __name__ == '__main__':
    reference = WordBitArray('words', 5, 3)

    with sys.stdin as input_text:
        words = set(input_text.read().split())

    for word in words:
        output = sys.stderr
        if reference.check(word):
            output = sys.stdout
        print(word, file=output)

Checking for false positives

""" false_positives.py """

# usage: python3 false_positives.py 2> /dev/null

import itertools
import string
import sys
from spell_checker import WordBitArray


reference = WordBitArray('words', 5, 3)

with open('words', 'r', encoding='utf-8') as text_input:
    dictionary = text_input.read().split()

size = 5
counter = 0
for combo in itertools.permutations(string.ascii_lowercase, size):
    word = ''.join(combo)
    if reference.check(word) and word not in dictionary:
        counter += 1
        print(word, file=sys.stderr)

print(f'{counter} false positives from {pow(26, size)} permutations')
print(f'{100 * counter / pow(26, size):.2f}% are false positives')

Reference