Spell Checker
8 July 2024
A solution to Bloom Filters kata described at CodeKata.
Solution
""" spell_checker.py - an experiment with Bloom filters """
# usage: python3 spell_checker.py < text > in 2> out ; wc -l in out
import hashlib
import string
import sys
class WordBitArray():
""" bitmap of words """
def __init__(self, filename, size=5, cycles=3):
self.size = size
self.cycles = cycles
self.array = 0
with open(filename, 'r', encoding='utf-8') as text_file:
dictionary = text_file.read()
for entry in dictionary.split():
for bit in self.bits(entry):
self.array |= 1 << bit
def bits(self, text):
""" which bits """
msg = hashlib.md5()
msg.update(text.encode())
return [int(msg.hexdigest()[i * self.size:(i + 1) * self.size],
base=16) for i in range(self.cycles)]
def check(self, text):
""" are text bits in array """
alpha = ''.join([i for i in text if i in string.ascii_letters])
key = 0
for bit in self.bits(alpha):
key |= 1 << bit
if key == key & self.array:
return True
if alpha == alpha.lower():
return False
return self.check(text.lower())
if __name__ == '__main__':
reference = WordBitArray('words', 5, 3)
with sys.stdin as input_text:
words = set(input_text.read().split())
for word in words:
output = sys.stderr
if reference.check(word):
output = sys.stdout
print(word, file=output)
Checking for false positives
""" false_positives.py """
# usage: python3 false_positives.py 2> /dev/null
import itertools
import string
import sys
from spell_checker import WordBitArray
reference = WordBitArray('words', 5, 3)
with open('words', 'r', encoding='utf-8') as text_input:
dictionary = text_input.read().split()
size = 5
counter = 0
for combo in itertools.permutations(string.ascii_lowercase, size):
word = ''.join(combo)
if reference.check(word) and word not in dictionary:
counter += 1
print(word, file=sys.stderr)
print(f'{counter} false positives from {pow(26, size)} permutations')
print(f'{100 * counter / pow(26, size):.2f}% are false positives')
Reference
- Bloom filter - Wikipedia