
Weird first word read Python [duplicate]


I'm reading the first Harry Potter book as a UTF-8 file in Python (I've tried both the io and codecs packages), and the first word that is read, "harry" (lowercase because I lowercase the text before word-tokenizing the entire corpus with nltk), comes out as '\ufeffharry'. I'm guessing this has to do with the encoding, and perhaps with it being the first word of the file.
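For context, '\ufeff' is the Unicode byte-order mark (BOM): some editors prepend it to UTF-8 files, and a plain 'utf-8' decode keeps it as the first character of the text, where it then sticks to the first token. A minimal sketch showing the difference between the 'utf-8' and 'utf-8-sig' codecs (the file name hp1.txt is made up for the demo):

# Write a small file that starts with a UTF-8 BOM (bytes EF BB BF).
with open('hp1.txt', 'wb') as f:
    f.write('\ufeffharry potter'.encode('utf-8'))

# Plain 'utf-8' keeps the BOM as a leading U+FEFF character.
with open('hp1.txt', encoding='utf-8') as f:
    print(repr(f.read().split()[0]))   # '\ufeffharry'

# 'utf-8-sig' strips a leading BOM if one is present.
with open('hp1.txt', encoding='utf-8-sig') as f:
    print(repr(f.read().split()[0]))   # 'harry'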

I'll include the code I'm using. The functions to focus on are process_file and perhaps _process_tokens.

What can I do so that 'harry' is read instead?

import io
import math
from collections import defaultdict

import nltk
from tqdm import tqdm


class NGramTrainer(object):
    START_SYMBOL = '</s>'

    def __init__(self, n):
        """
        NGramTrainer constructor
        :param n: Size of grams
        """
        # Size of grams
        self.n = n
        # Sentence starter that adds 'padding' to each sentence
        self.sentence_starter = [self.START_SYMBOL] * (n - 1)
        # Sentence tokens from corpus
        self.tokenized_sentences = None
        # Collection of n-gram counts (pos i corresponds to (i+1)-grams)
        self.ngrams = [defaultdict(int) for _ in range(n)]
        # For each word, its ID.
        self.word2id = {}
        # For each ID, its word
        self.id2word = {}
        # Vocabulary size: number of unique words.
        self.num_unique_words = 0
        # Total number of words from corpus
        self.num_total_words = 0

    def process_file(self, file):
        """
        Processes the file f.
        """
        with io.open(file, mode='r', encoding='utf-8') as f:
            text = f.read().encode('utf-8').decode().lower()
        try:
            self.tokenized_sentences = nltk.sent_tokenize(text)
        except LookupError:
            nltk.download('punkt')
            nltk.download('wordnet')
            nltk.download('omw-1.4')
            self.tokenized_sentences = nltk.sent_tokenize(text)
        # Process each sentence.
        for i in tqdm(range(len(self.tokenized_sentences)),
                      desc="Processing corpus", colour='green'):
            tokenized_words = nltk.word_tokenize(self.tokenized_sentences[i])
            tokenized_words = self.sentence_starter + tokenized_words
            self._process_tokens(tokenized_words)
        print(self.ngrams[0])

    def _process_tokens(self, tokens):
        """
        Processes the list of tokens, and adjusts the n-gram counts.
        :param tokens: The list of tokens to be processed.
        """
        # Update maps word2id & id2word.
        for token in tokens:
            if token not in self.word2id:
                self.word2id[token] = self.num_unique_words
                self.id2word[self.num_unique_words] = token
                self.num_unique_words += 1
        self.num_total_words += len(tokens)
        # We will count one start symbol per sentence
        self.ngrams[0][(self.START_SYMBOL,)] += 1
        # Iterate over all possible n-grams
        for i in range(self.n - 1, len(tokens)):
            # Obtain the n-gram stretching from pos i-n+1 to i --> interval [i-n+1, i+1)
            ngram = tuple(tokens[i - self.n + 1:i + 1])
            # Update the count for each l-gram, l = 1, ..., n
            for k in range(self.n):  # k = 0, ..., n-1
                self.ngrams[k][ngram[self.n - 1 - k:]] += 1

    def _get_stats(self):
        """
        Returns the model statistics
        """
        print("Harry", self.ngrams[0][('harry',)])
        # Initial row
        rows = [str(self.num_unique_words) + " " + str(self.num_total_words)]
        # For each k-grams, print their stats
        for k in range(self.n):
            # Get the k-grams
            kgrams = self.ngrams[k]
            # Record how many lines are going to follow
            rows.append(str(len(kgrams)))
            # For each kgram (tuple) in the kgrams dict
            for kgram in kgrams:
                # Transform the words into string ids
                ids = ' '.join(str(self.word2id[word]) for word in kgram)
                # Compute the (log) probability
                # P(w_i | w_{i-n+1}, ..., w_{i-1}) =
                # c(w_{i-n+1}, ..., w_{i-1}, w_i) / c(w_{i-n+1}, ..., w_{i-1})
                # Get the number of occurrences of this kgram
                kgram_count = kgrams[kgram]
                if k == 0:  # Uni-gram --> Use log_prob for unigram_count
                    log_prob = math.log(kgram_count) - math.log(self.num_total_words)
                    ids += ' ' + kgram[0]  # Append word.
                else:  # Dealing with 2, 3, ... -grams.
                    # If the previous kgram doesn't exist (start symbols)
                    if kgram[:-1] not in self.ngrams[k - 1]:
                        log_prob = -float('inf')  # So that e^(-inf) = 0
                    else:
                        prev_kgram_count = self.ngrams[k - 1][kgram[:-1]]
                        log_prob = math.log(kgram_count) - math.log(prev_kgram_count)
                log_prob = format(log_prob, '.15f')
                rows.append(ids + " " + str(kgram_count) + " " + str(log_prob))
        rows.append(str(-1))  # EOF
        return rows

    def save_model(self, file):
        """
        Save model stats in the provided file
        """
        try:
            print("Saving model...")
            with io.open(file, mode='w', encoding='utf-8') as f:
                for row in self._get_stats():
                    f.write(row + '\n')
            print("Model saved!")
        except FileNotFoundError:
            print("The file", file, "was not found.")
        except IOError:
            print("An IOError occurred while saving the model.")
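For what it's worth, here is a minimal sketch of how the file-reading step in process_file could strip the BOM, assuming the corpus file was saved with a UTF-8 BOM (as Windows Notepad does, for example). Note that the .encode('utf-8').decode() round-trip in the original is a no-op and can be dropped:

with io.open(file, mode='r', encoding='utf-8-sig') as f:
    # 'utf-8-sig' consumes a leading U+FEFF (BOM) if one is present,
    # so the first token comes out as 'harry' instead of '\ufeffharry'.
    text = f.read().lower()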

