Week 11: Contributing to PyTorch-NLP
This week, Xiaolei Wang and I worked on fixing issue #7 in the PyTorch-NLP project.
Following the reference given by the author (https://github.com/eladhoffer/seq2seq.pytorch/blob/master/seq2seq/tools/tokenizer.py), we finally made our small step: a pull request to Free Open Source Software.
This is the first time I have finished part of a FOSS project and made a pull request!!! Many thanks to the FOSS class.
Core Steps
The general procedure for making a pull request:
- The first step is to fork the upstream repo into your own workspace, and always keep your fork in sync with the upstream repo (see https://blog.csdn.net/matrix_google/article/details/80676034); a command sketch is given after this list.
- Second, write your own code and push it to your own repo (the one forked from the upstream repo).
- Last but not least, when you have everything ready, open the pull request, mention the corresponding issue, and optionally leave a comment for the author.
- Finally, keep an eye on your PR: make sure it passes all tests and gets excellent coverage.
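As a concrete sketch of that workflow (YOUR-USERNAME and the branch name `bpe-encoder` are placeholders; adjust the default branch name to whatever the repo uses):

```bash
# Clone your fork (YOUR-USERNAME is a placeholder) and register the upstream repo.
git clone https://github.com/YOUR-USERNAME/PyTorch-NLP.git
cd PyTorch-NLP
git remote add upstream https://github.com/PetrochukM/PyTorch-NLP.git

# Later, pull upstream changes into your local branch and update your fork.
git fetch upstream
git checkout master
git merge upstream/master
git push origin master

# Work on a feature branch and push it; then open the PR on GitHub.
git checkout -b bpe-encoder
git push origin bpe-encoder
```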
BytePair Encoding PR Experience Summary
Xiaolei Wang provided me with a reasonable interface and made it clear what I should do next; I finished the tokenizer, which is the focal part of the encoder, and wrote tests for it.
We also confronted many challenges, which you can see in our six failed runs of the test suite.
For a comprehensive local test, we should use:
```bash
RUN_DOCS=true RUN_SLOW=true RUN_FLAKE8=true bash build_tools/travis/test_script.sh
```
The first failure was the PEP8 format check (e.g., lines longer than 100 characters). The author defines the formatting constraints in the .flake8 file:
```
[flake8]
ignore = E402, E722, E731, E741, W504
max-line-length = 100
exclude = examples/
```
The second issue concerned Python 3.5, which does not behave as predictably as later versions: our next two runs failed on Python 3.5 while passing on Python 3.6 and 3.7. Eventually, Xiaolei Wang found out what was going on, and the tests finally passed on the cloud CI; a plausible explanation is sketched below.
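One plausible culprit (an assumption on my part, not a confirmed diagnosis) is that dict and Counter iteration order was arbitrary before Python 3.6. Because `build_from_corpus` feeds `vocab_words.items()` to `learn_bpe`, equally frequent pairs can be visited in a different order on each Python 3.5 run, yielding a different merge table and breaking any test that asserts the exact learned codes:

```python
# Illustration only (assumed cause, not confirmed): on Python 3.5, the order
# of dict/Counter iteration is arbitrary, so the word list handed to
# learn_bpe may differ between runs; on 3.6+ insertion order is preserved.
from collections import Counter

vocab = Counter({'low': 2, 'new': 2, 'west': 2})  # deliberately tied counts
vocab_list = ['{0} {1}'.format(word, freq) for word, freq in vocab.items()]
print(vocab_list)  # stable on Python 3.6+, not guaranteed on 3.5
```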
About Our Code
The major code is shown as follows:
./PyTorch-NLP/torchnlp/encoders/text/bpe_encoder.py: you can find the final version in Xiaolei Wang's report or on GitHub.
./PyTorch-NLP/torchnlp/encoders/text/bpe_text_tokenizer.py:
```python
import codecs
from collections import Counter

from sacremoses import MosesTokenizer, MosesDetokenizer
from subword_nmt import learn_bpe, apply_bpe


class BPETextTokenizer(object):
    _moses_tok = MosesTokenizer(lang='en')
    _moses_detok = MosesDetokenizer(lang='en')

    def __init__(self, file_prefix=None, separator='@@'):
        if file_prefix is not None:
            self.codes_file = '{}.vocab'.format(file_prefix)
        self.separator = separator
        self.bpe = None
        self.vocab = None

    @staticmethod
    def pre_tokenize(line):
        # Moses tokenization (with XML escaping), returned as a single string.
        return BPETextTokenizer._moses_tok.tokenize(line, return_str=True)

    @staticmethod
    def _segment_words(line, pre_apply=None):
        if pre_apply is not None:
            line = pre_apply(line)
        line = str(line)
        return line.strip('\r\n ').split()

    @staticmethod
    def get_vocabulary(item_list, segment=_segment_words, from_filenames=True):
        vocab = Counter()
        if from_filenames:
            # item_list holds file names; count words line by line in each file.
            for fname in item_list:
                with codecs.open(fname, encoding='UTF-8') as f:
                    for line in f:
                        for word in segment(line):
                            vocab[word] += 1
        else:
            # item_list holds raw lines of text.
            for line in item_list:
                for word in segment(line):
                    vocab[word] += 1
        return vocab

    def build_from_corpus(self, item_list, min_count=2, num_symbols=10000,
                          total_symbols=False, from_filenames=True):
        def segment_words(line):
            return self._segment_words(line, self.pre_tokenize)

        vocab_words = self.get_vocabulary(item_list, segment_words,
                                          from_filenames=from_filenames)
        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in vocab_words.items()]
        # Learn the BPE merge operations and write them to the codes file.
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list, output, num_symbols=num_symbols,
                                min_frequency=min_count, verbose=False,
                                is_dict=True, total_symbols=total_symbols)
        # Reload the codes file so the tokenizer can apply the learned merges.
        with codecs.open(self.codes_file, encoding='UTF-8') as codes:
            self.bpe = apply_bpe.BPE(codes, separator=self.separator)
        self.vocab = dict(self.get_vocabulary(item_list=item_list, segment=self.segment,
                                              from_filenames=from_filenames))

    def segment(self, line):
        if self.bpe is None:  # build_from_corpus() must run before segmenting
            raise NameError('Learn bpe first!')
        line = self.pre_tokenize(line)
        return self.bpe.segment(line.strip('\r\n ')).split(' ')

    def encode(self, raw_text):
        return self.segment(raw_text)

    def decode(self, bpe_text, delimiter=' '):
        decode_string = delimiter.join(bpe_text)
        try:
            decode_string = decode_string.decode('utf-8')
        except Exception:
            pass
        # Drop the separator markers to merge subwords back into full words.
        decode_string = decode_string \
            .replace(self.separator + ' ', '') \
            .replace(self.separator, '')
        decode_string = str(decode_string).strip('\r\n ').split()
        decode_string = self._moses_detok.tokenize(decode_string)
        return decode_string
```
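For context, here is a minimal usage sketch of the tokenizer above (the corpus lines and the `demo_bpe` file prefix are made up for illustration):

```python
from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer

corpus = ['the quick brown fox jumps over the lazy dog',
          'the quick brown fox is quick']

tokenizer = BPETextTokenizer('demo_bpe')  # learned merges go to demo_bpe.vocab
tokenizer.build_from_corpus(corpus, from_filenames=False, min_count=1)

encoded = tokenizer.encode('the quick fox')  # a list of subword tokens
decoded = tokenizer.decode(encoded)          # the detokenized string
print(encoded)
print(decoded)
```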
Unit Tests
./PyTorch-NLP/tests/encoders/text/test_bytepair_encoder.py: shown in Xiaolei Wang's report / on GitHub.
./PyTorch-NLP/tests/encoders/text/test_bytepair_tokenizer.py:
```python
import pickle
import unittest

from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer


class TestBPETextTokenizer(unittest.TestCase):

    def setUp(self):
        self.corpus = [
            "One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
            'know.', 'Groucho Marx',
            "I haven't slept for 10 days... because that would be too long.", 'Mitch Hedberg'
        ]

    def test_pre_tokenizer(self):
        # Moses escapes the apostrophe, so "don't" becomes "don &apos;t".
        expected = ['One morning I shot an elephant in my pajamas . How he got in my pajamas ,'
                    ' I don &apos;t',
                    'know .',
                    'Groucho Marx',
                    'I haven &apos;t slept for 10 days ... because that would be too long .',
                    'Mitch Hedberg']
        self.assertListEqual(expected, [BPETextTokenizer.pre_tokenize(sen) for sen in self.corpus])

    def test_get_vocabulary(self):
        def segment_words(line):
            return BPETextTokenizer._segment_words(line, BPETextTokenizer.pre_tokenize)

        token_counts = BPETextTokenizer.get_vocabulary(self.corpus,
                                                       segment_words, from_filenames=False)
        expected = {
            "&apos;t": 2,
            ".": 3,
            "...": 1,
            "Groucho": 1,
            "Marx": 1,
            "Mitch": 1,
            "Hedberg": 1,
            "I": 3,
            "in": 2,
            "my": 2,
            "know": 1,
            "because": 1,
            "pajamas": 2,
        }
        self.assertDictContainsSubset(expected, token_counts)

    def test_learn_bpe(self):
        tokenizer = BPETextTokenizer('test_bpe')
        tokenizer.build_from_corpus(self.corpus, from_filenames=False)
        expected = {('&', 'apos;t</w>'): 21, ('a', 'pos;t</w>'): 20, ('b', 'e'): 19,
                    ('i', 'n</w>'): 18, ('le', 'p'): 17, ('l', 'e'): 16, ('m', 'y</w>'): 15,
                    ('n', 'g</w>'): 14, ('o', 't</w>'): 13, ('o', 'u'): 12, ('o', 'w</w>'): 11,
                    ('pajama', 's</w>'): 10, ('pajam', 'a'): 9, ('paja', 'm'): 8, ('paj', 'a'): 7,
                    ('pa', 'j'): 6, ('p', 'a'): 5, ('po', 's;t</w>'): 4, ('p', 'o'): 3,
                    ('s;', 't</w>'): 2, ('s', ';'): 1, ('h', 'a'): 0}
        self.assertDictEqual(expected, tokenizer.bpe.bpe_codes)

    def test_encode_decode(self):
        corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
                  'to build a vocabulary. It will be used when strings are encoded ',
                  'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']
        original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
        tokenizer = BPETextTokenizer('test_bpe')
        tokenizer.build_from_corpus(corpus, from_filenames=False)

        # Encoding should be reversible.
        encoded = tokenizer.encode(original)
        decoded = tokenizer.decode(encoded)
        self.assertEqual(original, decoded)

        # The substrings coded@@ and en@@ are frequent enough in the corpus that
        # they should appear in the vocabulary even though they are substrings
        # of other included strings.
        subtoken_strings = encoded
        self.assertIn('en@@', subtoken_strings)
        self.assertIn('code@@', subtoken_strings)

    def test_build_vocab(self):
        tokenizer = BPETextTokenizer('test_bpe')
        tokenizer.build_from_corpus(self.corpus, from_filenames=False)
        # Test all the items in the vocab.
        expect = {'O@@': 1, 'n@@': 4, 'e': 4, 'm@@': 1, 'o@@': 5, 'r@@': 4, 'i@@': 2,
                  'ng': 2, 'I': 3, 's@@': 3, 'h@@': 3, 'ot': 2, 'a@@': 4, 'n': 3,
                  'e@@': 3, 'lep@@': 2, 'ha@@': 3, 't': 3, 'in': 2, 'my': 2,
                  'pajamas': 2, '.': 4, 'H@@': 2, 'ow': 2, 'g@@': 1, ',': 1, 'd@@': 3,
                  '&apos;t': 2, 'k@@': 1, 'G@@': 1, 'ou@@': 2, 'c@@': 3, 'o': 2,
                  'M@@': 2, 'x': 1, 'v@@': 1, 'f@@': 1, 'r': 1, '1@@': 1, '0': 1,
                  'y@@': 1, 's': 1, '.@@': 2, 'be@@': 2, 'u@@': 1, 't@@': 3,
                  'w@@': 1, 'l@@': 2, 'd': 1, 'b@@': 1, 'h': 1, 'g': 1}
        self.assertDictEqual(expect, tokenizer.vocab)

    def test_is_pickleable(self):
        tokenizer = BPETextTokenizer('test_bpe')
        pickle.dumps(tokenizer)
```
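These tests can be run on their own with `python -m pytest tests/encoders/text/` (assuming pytest is installed), or through the full Travis script shown earlier, which also covers the flake8 and documentation checks.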