-
Notifications
You must be signed in to change notification settings - Fork 0
/
spellchecker.c
133 lines (102 loc) · 3.03 KB
/
spellchecker.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/*
* TODO:
* Check if the file operations succeeded
* DRY out next_word_in_dict and next_word_in_text
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdbool.h>
#include "bloom_filter.h"
#define INITIAL_WORD_BUFFER_LEN 6
#define ESTIMATED_DICT_SIZE 150000
#define FALSE_POSITIVE_PROBABILITY 0.01
bloom_filter_t *get_filter_from_dict(FILE *dict);
/*
* Return the next word from the dict stream
* It's assumed that words are seperated using new lines
*/
char *next_word_in_dict(FILE *dict);
bool is_word_char(char c);
/*
* Return the next word from the text stream(as lowercase).
* A word is a sequence of characters and apostrophes.
* Everything else gets ignored.
*/
char *next_word_in_text(FILE *text);
int main(int argc, char **argv) {
if (argc != 3) {
fprintf(stderr, "Usage: %s dictionary text\n", argv[0]);
return EXIT_FAILURE;
}
FILE *dictionary;
dictionary = fopen(argv[1], "r");
bloom_filter_t *filter;
filter = get_filter_from_dict(dictionary);
fclose(dictionary);
FILE *text;
text = fopen(argv[2], "r");
int misspelled_words = 0;
char *word;
while ((word = next_word_in_text(text))) {
if (!bloom_filter_might_contain(filter, word)) {
misspelled_words++;
printf("%s\n", word);
}
free(word);
}
bloom_filter_destroy(filter);
fclose(text);
printf("Misspelled words: %d\n", misspelled_words);
return EXIT_SUCCESS;
}
bool is_word_char(char c) {
return isalpha(c) || c == '\'';
}
char *next_word_in_text(FILE *text) {
char *word_buffer = malloc(INITIAL_WORD_BUFFER_LEN);
int word_len = 0, buffer_size = INITIAL_WORD_BUFFER_LEN;
char c;
while (!is_word_char((c = fgetc(text))) && c != EOF); // Skip all invalid chars
while (is_word_char(c)) {
word_buffer[word_len++] = tolower(c);
if (buffer_size == word_len) {
buffer_size *= 2;
word_buffer = realloc(word_buffer, buffer_size);
}
c = fgetc(text);
}
if (word_len == 0 && c == EOF) {
free(word_buffer);
return NULL;
}
word_buffer[word_len] = '\0';
return word_buffer;
}
char *next_word_in_dict(FILE *dict) {
char *word_buffer = malloc(INITIAL_WORD_BUFFER_LEN);
int word_len = 0, buffer_size = INITIAL_WORD_BUFFER_LEN;
char c;
while ((c = fgetc(dict)) != '\n' && c != EOF) {
word_buffer[word_len++] = c;
if (word_len == buffer_size) {
buffer_size *= 2;
word_buffer = realloc(word_buffer, buffer_size);
}
}
if (word_len == 0 && c == EOF) {
free(word_buffer);
return NULL;
}
word_buffer[word_len] = '\0';
return word_buffer;
}
bloom_filter_t *get_filter_from_dict(FILE *dict) {
bloom_filter_t *filter = bloom_filter_create(ESTIMATED_DICT_SIZE, FALSE_POSITIVE_PROBABILITY);
char *word;
while ((word = next_word_in_dict(dict))) {
bloom_filter_add(filter, word);
free(word);
}
return filter;
}