I'm reading the first Harry Potter book as a UTF-8 file in Python (I've tried both the `io` and `codecs` packages), and the first word that is read, which is `harry` (lowercase because I first word-tokenize the entire corpus with `nltk`), comes out as `'\ufeffharry'`. I'm guessing this has to do with the encoding, and perhaps with the fact that it's the very first word of the file.
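
For context, here is a minimal sketch of what I suspect is happening (`sample.txt` is just a placeholder file, written the way many editors save UTF-8, i.e. with a byte order mark):

```python
# Write a file as UTF-8 with a BOM (the 'utf-8-sig' codec prepends one).
with open('sample.txt', 'w', encoding='utf-8-sig') as f:
    f.write('Harry Potter')

# Reading it back as plain UTF-8 keeps the BOM as a leading '\ufeff'.
with open('sample.txt', 'r', encoding='utf-8') as f:
    print(repr(f.read()))  # prints '\ufeffHarry Potter'
```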
I'll include the code I'm using below. The function to focus on is `process_file`, and perhaps `_process_tokens`. What can I do so that `'harry'` is read instead?
```python
import io
import math
from collections import defaultdict

import nltk
from tqdm import tqdm


class NGramTrainer(object):

    START_SYMBOL = '</s>'

    def __init__(self, n):
        """NGramTrainer constructor.

        :param n: Size of grams
        """
        # Size of grams
        self.n = n
        # Sentence starter that adds 'padding' to each sentence
        self.sentence_starter = [self.START_SYMBOL] * (n - 1)
        # Sentence tokens from corpus
        self.tokenized_sentences = None
        # Collection of n-gram counts (pos i corresponds to (i+1)-grams)
        self.ngrams = [defaultdict(int) for _ in range(n)]
        # For each word, its ID
        self.word2id = {}
        # For each ID, its word
        self.id2word = {}
        # Vocabulary size: number of unique words
        self.num_unique_words = 0
        # Total number of words from corpus
        self.num_total_words = 0

    def process_file(self, file):
        """Processes the given file."""
        with io.open(file, mode='r', encoding='utf-8') as f:
            text = f.read().encode('utf-8').decode().lower()
        try:
            self.tokenized_sentences = nltk.sent_tokenize(text)
        except LookupError:
            nltk.download('punkt')
            nltk.download('wordnet')
            nltk.download('omw-1.4')
            self.tokenized_sentences = nltk.sent_tokenize(text)
        # Process each sentence.
        for i in tqdm(range(len(self.tokenized_sentences)),
                      desc="Processing corpus", colour='green'):
            tokenized_words = nltk.word_tokenize(self.tokenized_sentences[i])
            tokenized_words = self.sentence_starter + tokenized_words
            self._process_tokens(tokenized_words)
        print(self.ngrams[0])

    def _process_tokens(self, tokens):
        """Processes the list of tokens and adjusts the n-gram counts.

        :param tokens: The list of tokens to be processed.
        """
        # Update maps word2id & id2word.
        for token in tokens:
            if token not in self.word2id:
                self.word2id[token] = self.num_unique_words
                self.id2word[self.num_unique_words] = token
                self.num_unique_words += 1
        self.num_total_words += len(tokens)
        # We count one start symbol per sentence.
        self.ngrams[0][(self.START_SYMBOL,)] += 1
        # Iterate over all possible n-grams.
        for i in range(self.n - 1, len(tokens)):
            # Obtain the n-gram stretching from pos i-n+1 to i --> interval [i-n+1, i+1)
            ngram = tuple(tokens[i - self.n + 1:i + 1])
            # Update the count for each l-gram, l = 1, ..., n
            for k in range(self.n):  # k = 0, ..., n-1
                # The (k+1)-gram is the last k+1 tokens of the n-gram.
                self.ngrams[k][ngram[self.n - 1 - k:]] += 1

    def _get_stats(self):
        """Returns the model statistics."""
        print("Harry", self.ngrams[0][('harry',)])
        # Initial row: vocabulary size and total word count.
        rows = [str(self.num_unique_words) + " " + str(self.num_total_words)]
        # For each order k, record the stats of its k-grams.
        for k in range(self.n):
            kgrams = self.ngrams[k]
            # Record how many lines will follow.
            rows.append(str(len(kgrams)))
            # For each kgram (tuple) in the kgrams dict:
            for kgram in kgrams:
                # Transform the words into string IDs.
                ids = ' '.join(str(self.word2id[word]) for word in kgram)
                # Compute the (log) probability:
                # P(w_i | w_{i-n+1}, ..., w_{i-1}) =
                #     c(w_{i-n+1}, ..., w_{i-1}, w_i) / c(w_{i-n+1}, ..., w_{i-1})
                kgram_count = kgrams[kgram]
                if k == 0:
                    # Unigram --> use the total word count as denominator.
                    log_prob = math.log(kgram_count) - math.log(self.num_total_words)
                    ids += ' ' + kgram[0]  # Append the word itself.
                else:
                    # Dealing with 2-, 3-, ... -grams.
                    # If the previous kgram doesn't exist (start symbols):
                    if kgram[:-1] not in self.ngrams[k - 1]:
                        log_prob = -float('inf')  # So that e^(-inf) = 0
                    else:
                        prev_kgram_count = self.ngrams[k - 1][kgram[:-1]]
                        log_prob = math.log(kgram_count) - math.log(prev_kgram_count)
                log_prob = format(log_prob, '.15f')
                rows.append(ids + " " + str(kgram_count) + " " + log_prob)
        rows.append(str(-1))  # EOF marker
        return rows

    def save_model(self, file):
        """Saves the model stats in the provided file."""
        try:
            print("Saving model...")
            with io.open(file, mode='w', encoding='utf-8') as f:
                for row in self._get_stats():
                    f.write(row + '\n')
            print("Model saved!")
        except FileNotFoundError:
            print("The file", file, "was not found.")
        except IOError:
            print("An IOError occurred while saving the model.")
```
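
For completeness, this is roughly how I drive the class (the file names are placeholders):

```python
trainer = NGramTrainer(3)                    # e.g. a trigram model
trainer.process_file('harry_potter_1.txt')   # placeholder path to the UTF-8 corpus
trainer.save_model('model.txt')              # placeholder path for the saved stats
```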