I'm reading the first Harry Potter book as a UTF-8 file in Python (I've tried both the `io` and `codecs` packages), and the first word that is read, which is `harry` (lowercase because I first word-tokenize the entire corpus with `nltk`), comes out as `'\ufeffharry'`. I'm guessing this has to do with the encoding, and perhaps with the fact that it's the very first word of the file.
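
For context, here is a minimal sketch of what I suspect is happening (`sample.txt` is just a placeholder file, written the way many editors save UTF-8, i.e. with a byte order mark):

```python
# Write a file as UTF-8 with a BOM (the 'utf-8-sig' codec prepends one).
with open('sample.txt', 'w', encoding='utf-8-sig') as f:
    f.write('Harry Potter')

# Reading it back as plain UTF-8 keeps the BOM as a leading '\ufeff'.
with open('sample.txt', 'r', encoding='utf-8') as f:
    print(repr(f.read()))  # prints '\ufeffHarry Potter'
```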
I'll include the code I'm using below. The function to focus on is `process_file`, and perhaps `_process_tokens`. What can I do so that `'harry'` is read instead?
```python
import io
import math
from collections import defaultdict

import nltk
from tqdm import tqdm


class NGramTrainer(object):

    START_SYMBOL = '</s>'

    def __init__(self, n):
        """NGramTrainer constructor.

        :param n: Size of grams
        """
        # Size of grams
        self.n = n
        # Sentence starter that adds 'padding' to each sentence
        self.sentence_starter = [self.START_SYMBOL] * (n - 1)
        # Sentence tokens from corpus
        self.tokenized_sentences = None
        # Collection of n-gram counts (pos i corresponds to (i+1)-grams)
        self.ngrams = [defaultdict(int) for _ in range(n)]
        # For each word, its ID
        self.word2id = {}
        # For each ID, its word
        self.id2word = {}
        # Vocabulary size: number of unique words
        self.num_unique_words = 0
        # Total number of words from corpus
        self.num_total_words = 0

    def process_file(self, file):
        """Processes the given file."""
        with io.open(file, mode='r', encoding='utf-8') as f:
            text = f.read().encode('utf-8').decode().lower()
        try:
            self.tokenized_sentences = nltk.sent_tokenize(text)
        except LookupError:
            nltk.download('punkt')
            nltk.download('wordnet')
            nltk.download('omw-1.4')
            self.tokenized_sentences = nltk.sent_tokenize(text)
        # Process each sentence.
        for i in tqdm(range(len(self.tokenized_sentences)),
                      desc="Processing corpus", colour='green'):
            tokenized_words = nltk.word_tokenize(self.tokenized_sentences[i])
            tokenized_words = self.sentence_starter + tokenized_words
            self._process_tokens(tokenized_words)
        print(self.ngrams[0])

    def _process_tokens(self, tokens):
        """Processes the list of tokens and adjusts the n-gram counts.

        :param tokens: The list of tokens to be processed.
        """
        # Update maps word2id & id2word.
        for token in tokens:
            if token not in self.word2id:
                self.word2id[token] = self.num_unique_words
                self.id2word[self.num_unique_words] = token
                self.num_unique_words += 1
        self.num_total_words += len(tokens)
        # We count one start symbol per sentence.
        self.ngrams[0][(self.START_SYMBOL,)] += 1
        # Iterate over all possible n-grams.
        for i in range(self.n - 1, len(tokens)):
            # Obtain the n-gram stretching from pos i-n+1 to i --> interval [i-n+1, i+1)
            ngram = tuple(tokens[i - self.n + 1:i + 1])
            # Update the count for each l-gram, l = 1, ..., n
            for k in range(self.n):  # k = 0, ..., n-1
                # The (k+1)-gram is the last k+1 tokens of the n-gram.
                self.ngrams[k][ngram[self.n - 1 - k:]] += 1

    def _get_stats(self):
        """Returns the model statistics."""
        print("Harry", self.ngrams[0][('harry',)])
        # Initial row: vocabulary size and total word count.
        rows = [str(self.num_unique_words) + " " + str(self.num_total_words)]
        # For each order k, record the stats of its k-grams.
        for k in range(self.n):
            kgrams = self.ngrams[k]
            # Record how many lines will follow.
            rows.append(str(len(kgrams)))
            # For each kgram (tuple) in the kgrams dict:
            for kgram in kgrams:
                # Transform the words into string IDs.
                ids = ' '.join(str(self.word2id[word]) for word in kgram)
                # Compute the (log) probability:
                # P(w_i | w_{i-n+1}, ..., w_{i-1}) =
                #     c(w_{i-n+1}, ..., w_{i-1}, w_i) / c(w_{i-n+1}, ..., w_{i-1})
                kgram_count = kgrams[kgram]
                if k == 0:
                    # Unigram --> use the total word count as denominator.
                    log_prob = math.log(kgram_count) - math.log(self.num_total_words)
                    ids += ' ' + kgram[0]  # Append the word itself.
                else:
                    # Dealing with 2-, 3-, ... -grams.
                    # If the previous kgram doesn't exist (start symbols):
                    if kgram[:-1] not in self.ngrams[k - 1]:
                        log_prob = -float('inf')  # So that e^(-inf) = 0
                    else:
                        prev_kgram_count = self.ngrams[k - 1][kgram[:-1]]
                        log_prob = math.log(kgram_count) - math.log(prev_kgram_count)
                log_prob = format(log_prob, '.15f')
                rows.append(ids + " " + str(kgram_count) + " " + log_prob)
        rows.append(str(-1))  # EOF marker
        return rows

    def save_model(self, file):
        """Saves the model stats in the provided file."""
        try:
            print("Saving model...")
            with io.open(file, mode='w', encoding='utf-8') as f:
                for row in self._get_stats():
                    f.write(row + '\n')
            print("Model saved!")
        except FileNotFoundError:
            print("The file", file, "was not found.")
        except IOError:
            print("An IOError occurred while saving the model.")
```
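
For completeness, this is roughly how I drive the class (the file names are placeholders):

```python
trainer = NGramTrainer(3)                    # e.g. a trigram model
trainer.process_file('harry_potter_1.txt')   # placeholder path to the UTF-8 corpus
trainer.save_model('model.txt')              # placeholder path for the saved stats
```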