# PyTorch Tutorial RNN 2
# =========================
#
# :date: 22 oct 2019
#
# `download data `__
#
# .. code:: python

from io import open
import re
import unicodedata

import numpy as np
import random


class Language:
    """Vocabulary built incrementally from whitespace-separated sentences.

    Attributes are plain dictionaries so they can be inspected directly:
    ``word2index`` maps a word to its integer id, ``index2word`` is the
    reverse mapping, ``word2count`` holds how often each word was seen and
    ``nWords`` is the next free id (ids 0 and 1 are reserved for the
    "SOS" and "EOS" markers).
    """

    def __init__(self):
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.nWords = 2

    def addSentence(self, sentence):
        """Register every word of *sentence* (split on single spaces).

        A word seen for the first time receives the next free index and a
        count of 1; a known word only has its count incremented.
        """
        for word in sentence.split(' '):
            if word in self.word2index:
                self.word2count[word] += 1
                continue
            self.word2index[word] = self.nWords
            self.word2count[word] = 1
            self.index2word[self.nWords] = word
            self.nWords += 1


class GetTranslation:
    """Load a tab-separated file of sentence pairs and build two vocabularies.

    Each line of *file* is split on tabs and every field is normalised with
    :meth:`normalizeString`.  The pairs accepted by :meth:`filterPairs` are
    stored in ``self.pairs``; the first field of each pair feeds
    ``self.lang1`` and the second feeds ``self.lang2``.
    """

    def __init__(self, file, maxLen=10):
        """Read *file* (UTF-8), normalise, filter and index its pairs.

        Args:
            file: path of the tab-separated text file.
            maxLen: stored as ``self.maxLen``; not otherwise used by the
                code visible in this class.
        """
        self.maxLen = maxLen
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(file, encoding='utf-8') as handle:
            lines = handle.read().strip().split('\n')
        pairs = [
            [self.normalizeString(s) for s in line.split('\t')]
            for line in lines
        ]
        self.lang1 = Language()
        self.lang2 = Language()
        self.pairs = self.filterPairs(pairs)
        # Reports the number of pairs read, i.e. before filtering.
        print("len(pairs)=", len(pairs))
        for pair in self.pairs:
            self.lang1.addSentence(pair[0])
            self.lang2.addSentence(pair[1])
        print("lang1", self.lang1.nWords)
        print("lang2", self.lang2.nWords)

    def normalizeString(self, s):
        """Lower-case and strip *s*, drop accents and isolate punctuation.

        ``.``, ``!`` and ``?`` get a space inserted in front of them, then
        every run of characters other than ASCII letters and those three
        marks is collapsed into a single space.
        """
        s = self.unicodeToAscii(s.lower().strip())
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
        return s

    def unicodeToAscii(self, s):
        """Return *s* NFD-decomposed with combining marks (category Mn) removed."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

    def filterPairs(self, pairs):
        """Return the pairs for which :meth:`keepPair` gives a truthy result."""
        return [pair for pair in pairs if self.keepPair(pair)]

    def keepPair(self, pair):
        """Return the number of space-separated tokens in ``pair[0]``.

        NOTE(review): this expression looks truncated.  ``englishPrefixes``
        and ``self.maxLen`` are never used, and ``str.split(' ')`` always
        yields at least one element, so the result is always truthy and
        :meth:`filterPairs` keeps every pair.  Presumably a length/prefix
        comparison was intended -- confirm against the original tutorial
        before relying on the filter.
        """
        englishPrefixes = (
            "i am ", "i m ",
            "he is", "he s ",
            "she is", "she s ",
            "you are", "you re ",
            "we are", "we re ",
            "they are", "they re ",
        )
        return len(pair[0].split(' '))