Text core

! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *
from nbdev.showdoc import *


Basic functions to preprocess texts before assembling them in `DataLoaders`.

import html

Preprocessing rules

The rules below are applied to texts before or after they are tokenized.

# special tokens
UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj".split()
_all_ = ["UNK", "PAD", "BOS", "EOS", "FLD", "TK_REP", "TK_WREP", "TK_UP", "TK_MAJ"]
_re_spec = re.compile(r'([/#\\])')

def spec_add_spaces(t):
    "Add spaces around / and #"
    return _re_spec.sub(r' \1 ', t)
test_eq(spec_add_spaces('#fastai'), ' # fastai')
test_eq(spec_add_spaces('/fastai'), ' / fastai')
test_eq(spec_add_spaces('\\fastai'), ' \\ fastai')
_re_space = re.compile(' {2,}')

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return _re_space.sub(' ', t)
test_eq(rm_useless_spaces('a  b   c'), 'a b c')
_re_rep = re.compile(r'(\S)(\1{2,})')

def replace_rep(t):
    "Replace repetitions at the character level: cccc -- TK_REP 4 c"
    def _replace_rep(m):
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    return _re_rep.sub(_replace_rep, t)

It starts replacing when the same character is repeated three times or more.

test_eq(replace_rep('aa'), 'aa')
test_eq(replace_rep('aaaa'), f' {TK_REP} 4 a ')
_re_wrep = re.compile(r'(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)')
"""
Matches any word repeated at least three times with spaces between them
(?:\s|^)       Non-Capture either a whitespace character or the beginning of text
(\w+)          Capture one or more alphanumeric characters
\s+            One or more whitespace
((?:\1\s+)+)   Capture a repetition of one or more times \1 followed by one or more whitespace
\1             Occurrence of \1
(\s|\W|$)      Capture last whitespace, non alphanumeric character or end of text
""";
def replace_wrep(t):
    "Replace word repetitions: word word word word -- TK_WREP 4 word"
    def _replace_wrep(m):
        c,cc,e = m.groups()
        return f' {TK_WREP} {len(cc.split())+2} {c} {e}'
    return _re_wrep.sub(_replace_wrep, t)

It starts replacing when the same word is repeated three times or more.

test_eq(replace_wrep('ah ah'), 'ah ah')
test_eq(replace_wrep('ah ah ah'), f' {TK_WREP} 3 ah ')
test_eq(replace_wrep('ah ah   ah  ah'), f' {TK_WREP} 4 ah ')
test_eq(replace_wrep('ah ah ah ah '), f' {TK_WREP} 4 ah  ')
test_eq(replace_wrep('ah ah ah ah.'), f' {TK_WREP} 4 ah .')
test_eq(replace_wrep('ah ah ahi'), f'ah ah ahi')
def fix_html(x):
    "Various messy things we've seen in documents"
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace('nbsp;', ' ').replace(
        '#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace('<br />', "\n").replace(
        '\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(' @-@ ','-').replace('...',' …')
    return html.unescape(x)
test_eq(fix_html('#39;bli#146;'), "'bli'")
test_eq(fix_html('Sarah amp; Duck...'), 'Sarah & Duck …')
test_eq(fix_html('a nbsp; #36;'), 'a   $')
test_eq(fix_html('\\" <unk>'), f'" {UNK}')
test_eq(fix_html('quot;  @.@  @-@ '), "' .-")
test_eq(fix_html('<br />text\\n'), '\ntext\n')
_re_all_caps = re.compile(r'(\s|^)([A-Z]+[^a-z\s]*)(?=(\s|$))')
"""
Catches any word in all caps, even with ' or - inside
(\s|^)        Capture either a whitespace or the beginning of text
([A-Z]+       Capture one capitalized letter or more...
[^a-z\s]*)    ...followed by anything that's not lowercase or whitespace
(?=(\s|$))    Look ahead for a space or end of text
""";
def replace_all_caps(t):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    def _replace_all_caps(m):
        tok = f'{TK_UP} ' if len(m.groups()[1]) > 1 else ''
        return f"{m.groups()[0]}{tok}{m.groups()[1].lower()}"
    return _re_all_caps.sub(_replace_all_caps, t)
test_eq(replace_all_caps("I'M SHOUTING"), f"{TK_UP} i'm {TK_UP} shouting")
test_eq(replace_all_caps("I'm speaking normally"), "I'm speaking normally")
test_eq(replace_all_caps("I am speaking normally"), "i am speaking normally")
_re_maj = re.compile(r'(\s|^)([A-Z][^A-Z\s]*)(?=(\s|$))')
"""
Catches any capitalized word
(\s|^)       Capture either a whitespace or the beginning of text
([A-Z]       Capture exactly one capitalized letter...
[^A-Z\s]*)   ...followed by anything that's not uppercase or whitespace
(?=(\s|$))   Look ahead for a space or end of text
""";
def replace_maj(t):
    "Replace tokens in Sentence Case by their lower version and add `TK_MAJ` before."
    def _replace_maj(m):
        tok = f'{TK_MAJ} ' if len(m.groups()[1]) > 1 else ''
        return f"{m.groups()[0]}{tok}{m.groups()[1].lower()}"
    return _re_maj.sub(_replace_maj, t)
test_eq(replace_maj("Jeremy Howard"), f'{TK_MAJ} jeremy {TK_MAJ} howard')
test_eq(replace_maj("I don't think there is any maj here"), ("i don't think there is any maj here"),)
def lowercase(t, add_bos=True, add_eos=False):
    "Converts `t` to lowercase"
    return (f'{BOS} ' if add_bos else '') + t.lower().strip() + (f' {EOS}' if add_eos else '')
def replace_space(t):
    "Replace embedded spaces in a token with unicode line char to allow for split/join"
    return t.replace(' ', '▁')
defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]
defaults.text_proc_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces,
                            replace_all_caps, replace_maj, lowercase]
defaults.text_postproc_rules = [replace_space]
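
As a quick illustration (not part of the original notebook), the composed default rules can be applied to a raw string to see the whole pipeline at once; the sample string below is made up:

# Illustrative only: apply the default preprocessing rules, in order, to a sample string
sample = "I'm SO excited!!! Check out www.fast.ai/#about"
print(compose(*defaults.text_proc_rules)(sample))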

Tokenizing

A tokenizer is a class that must implement `__call__`. This method receives an iterator of texts and must return a generator with their tokenized versions. Here is the most basic example:

class BaseTokenizer():
    "Basic tokenizer that just splits on spaces"
    def __init__(self, split_char=' ', **kwargs): self.split_char=split_char
    def __call__(self, items): return (t.split(self.split_char) for t in items)
tok = BaseTokenizer()
test_eq(tok(["This is a text"]), [["This", "is", "a", "text"]])
tok = BaseTokenizer('x')
test_eq(tok(["This is a text"]), [["This is a te", "t"]])
class SpacyTokenizer():
    "Spacy tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, buf_sz=5000):
        import spacy
        from spacy.symbols import ORTH
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        nlp = spacy.blank(lang)
        for w in self.special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pipe,self.buf_sz = nlp.pipe,buf_sz

    def __call__(self, items):
        return (L(doc).attrgot('text') for doc in self.pipe(map(str,items), batch_size=self.buf_sz))
WordTokenizer = SpacyTokenizer
tok = SpacyTokenizer()
inp,exp = "This isn't the easiest text.",["This", "is", "n't", "the", "easiest", "text", "."]
test_eq(L(tok([inp,inp])), [exp,exp])
class TokenizeWithRules:
    "A wrapper around `tok` which applies `rules`, then tokenizes, then applies `post_rules`"
    def __init__(self, tok, rules=None, post_rules=None):
        self.rules = L(ifnone(rules, defaults.text_proc_rules))
        self.post_f = compose(*L(ifnone(post_rules, defaults.text_postproc_rules)))
        self.tok = tok

    def __call__(self, batch):
        return (L(o).map(self.post_f) for o in self.tok(maps(*self.rules, batch)))
f = TokenizeWithRules(BaseTokenizer(),rules=[replace_all_caps])
test_eq(f(["THIS isn't a problem"]), [[TK_UP, 'this', "isn't", 'a', 'problem']])
f = TokenizeWithRules(SpacyTokenizer())
test_eq(f(["This isn't a problem"]), [[BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem']])
f = TokenizeWithRules(BaseTokenizer(split_char="'"), rules=[])
test_eq(f(["This isn't a problem"]), [['This▁isn', 't▁a▁problem']])

This is the main function that will be called during one of the processes handling tokenization. It iterates through a batch of texts, applies the `rules` to them and tokenizes them.

texts = ["this is a text", "this is another text"]
tok = TokenizeWithRules(BaseTokenizer(), texts.__getitem__)
test_eq(tok([0,1]), [['this', 'is', 'a', 'text'],['this', 'is', 'another', 'text']])
@delegates(TokenizeWithRules)
def tokenize1(text, tok, **kwargs):
    "Call `TokenizeWithRules` with a single text"
    return first(TokenizeWithRules(tok=tok, **kwargs)([text]))
test_eq(tokenize1("This isn't a problem", SpacyTokenizer()),
        [BOS, TK_MAJ, 'this', 'is', "n't", 'a', 'problem'])
test_eq(tokenize1("This isn't a problem", tok=BaseTokenizer(), rules=[]),
        ['This',"isn't",'a','problem'])
def parallel_tokenize(items, tok=None, rules=None, n_workers=defaults.cpus, **kwargs):
    "Calls optional `setup` on `tok` before launching `TokenizeWithRules` using `parallel_gen"
    if tok is None: tok = WordTokenizer()
    if hasattr(tok, 'setup'): tok.setup(items, rules)
    return parallel_gen(TokenizeWithRules, items, tok=tok, rules=rules, n_workers=n_workers, **kwargs)

Note that since this uses `parallel_gen` under the hood, the returned generator contains tuples of indices and results. There is no guarantee that the results come back in order, so if you need them ordered, sort by the first item of each tuple (the index).

res  = parallel_tokenize(['0 1', '1 2'], rules=[], n_workers=2)
idxs,toks = zip(*L(res).sorted(itemgetter(0)))
test_eq(toks, [['0','1'],['1','2']])
res1 = parallel_tokenize(['0 1', '1 2'], tok=BaseTokenizer(), rules=[], n_workers=0)
idxs1,toks1 = zip(*L(res1).sorted(itemgetter(0)))
test_eq(toks, toks1)

Tokenize texts in files

Preprocessing function for texts stored in files. Tokenized texts are saved with the same layout in a directory suffixed with `_tok` in the parent folder of `path` (this can be overridden with `output_dir`). That directory is the return value.

fn_counter_pkl = 'counter.pkl'
fn_lengths_pkl = 'lengths.pkl'
def _tokenize_files(func, files, path, output_dir=None, output_names=None, n_workers=defaults.cpus, rules=None, tok=None,
                   encoding='utf8', skip_if_exists=False):
    "Tokenize text `files` in parallel using `n_workers`"
    if tok is None: tok = WordTokenizer()
    output_dir = Path(ifnone(output_dir, path.parent/f'{path.name}_tok'))
    if skip_if_exists and output_dir.exists(): return output_dir
    output_dir.mkdir(exist_ok=True)
    if output_names is None: output_names = L(output_dir/f.relative_to(path) for f in files)
    rules = partial(Path.read_text, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))

    lengths,counter = {},Counter()
    for i,tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):
        out = func(i,output_dir)
        out.mk_write(' '.join(tok), encoding=encoding)
        lengths[str(files[i].relative_to(path))] = len(tok)
        counter.update(tok)

    save_pickle(output_dir/fn_lengths_pkl, lengths)
    save_pickle(output_dir/fn_counter_pkl, counter)
    return output_dir
@delegates(_tokenize_files)
def tokenize_folder(path, extensions=None, folders=None, output_dir=None, skip_if_exists=True, **kwargs):
    "Tokenize text files in `path` in parallel using `n_workers`"
    path,extensions = Path(path),ifnone(extensions, ['.txt'])
    files = get_files(path, extensions=extensions, recurse=True, folders=folders)
    def _f(i,output_dir): return output_dir/files[i].relative_to(path)
    return _tokenize_files(_f, files, path, skip_if_exists=skip_if_exists, **kwargs)

The result is saved in `output_dir` (defaults to a folder in the same parent directory as `path`, named `path.name` with `_tok` appended), with the same structure as `path`. The tokenized text of a given file is stored in a file of the same name in `output_dir`. Additionally, the number of tokens per file is saved in `output_dir/lengths.pkl` and the counts of all words in `output_dir/counter.pkl`.

`extensions` defaults to `['.txt']`, and all text files in `path` are processed unless you pass a list of folders in `folders`. `rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer.
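
A minimal, self-contained sketch of that workflow (the folder and file contents below are made up for illustration):

# Illustrative only: tokenize a tiny folder of .txt files and inspect the saved artifacts
with tempfile.TemporaryDirectory() as d:
    folder = Path(d)/'texts'; folder.mkdir()
    (folder/'a.txt').write_text("Hello WORLD hello")
    out_dir = tokenize_folder(folder, n_workers=0)
    print(load_pickle(out_dir/fn_counter_pkl), load_pickle(out_dir/fn_lengths_pkl))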

@delegates(_tokenize_files)
def tokenize_files(files, path, output_dir, output_names=None, **kwargs):
    "Tokenize text `files` in parallel using `n_workers`"
    if output_names is None: output_names = L(output_dir/f.relative_to(path) for f in files)
    def _f(i,output_dir): return output_dir/output_names[i]
    return _tokenize_files(_f, files, path, output_dir=output_dir, **kwargs)

Tokenize texts in a dataframe

def _join_texts(df, mark_fields=False):
    "Join texts in row `idx` of `df`, marking each field with `FLD` if `mark_fields=True`"
    text_col = (f'{FLD} {1} ' if mark_fields else '' ) + df.iloc[:,0].astype(str)
    for i in range(1,len(df.columns)):
        text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df.iloc[:,i].astype(str)
    return text_col.values
texts = [f"This is an example of text {i}" for i in range(10)]
df = pd.DataFrame({'text': texts, 'text1': texts}, columns=['text', 'text1'])
col = _join_texts(df, mark_fields=True)    

for i in range(len(df)):
    test_eq(col[i], f'{FLD} 1 This is an example of text {i} {FLD} 2 This is an example of text {i}')
def tokenize_texts(texts, n_workers=defaults.cpus, rules=None, tok=None):
    "Tokenize `texts` in parallel using `n_workers`"
    rules = L(ifnone(rules, defaults.text_proc_rules.copy()))
    outputs = L(parallel_tokenize(texts, tok=tok, rules=rules, n_workers=n_workers)
               ).sorted().itemgot(1)
    return outputs
def tokenize_df(df, text_cols, n_workers=defaults.cpus, rules=None, mark_fields=None,
                tok=None, tok_text_col="text"):
    "Tokenize texts in `df[text_cols]` in parallel using `n_workers` and stores them in `df[tok_text_col]`"
    text_cols = [df.columns[c] if isinstance(c, int) else c for c in L(text_cols)]
    #mark_fields defaults to False if there is one column of text, True if there are several
    if mark_fields is None: mark_fields = len(text_cols)>1
    rules = L(ifnone(rules, defaults.text_proc_rules.copy()))
    texts = _join_texts(df[text_cols], mark_fields=mark_fields)
    outputs = L(parallel_tokenize(texts, tok, rules, n_workers=n_workers)
               ).sorted().itemgot(1)

    other_cols = df.columns[~df.columns.isin(text_cols)]
    res = df[other_cols].copy()
    res[tok_text_col] = outputs
    res[f'{tok_text_col}_length'] = [len(o) for o in outputs]
    return res,Counter(outputs.concat())

This function returns a new dataframe with the same non-text columns, a column named `text` that contains the tokenized texts and a column named `text_length` that contains their respective lengths. It also returns a counter of all the words seen, to quickly build a vocabulary afterwards.

`rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column and `True` when there are several. In that case, the texts in those columns are joined with `FLD` markers followed by the number of the field.
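
For instance, with two text columns `mark_fields` defaults to `True` and each field is prefixed with `FLD` and its number; a small sketch (the dataframe below is made up):

# Illustrative only: tokenize a tiny dataframe with two text columns
df_demo = pd.DataFrame({'title': ["Good MOVIE"], 'body': ["I liked it a lot"], 'label': [1]})
out_demo,count_demo = tokenize_df(df_demo, text_cols=['title', 'body'], n_workers=0)
print(list(out_demo['text'][0]), out_demo['text_length'][0])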

def tokenize_csv(fname, text_cols, outname=None, n_workers=4, rules=None, mark_fields=None,
                 tok=None, header='infer', chunksize=50000):
    "Tokenize texts in the `text_cols` of the csv `fname` in parallel using `n_workers`"
    df = pd.read_csv(fname, header=header, chunksize=chunksize)
    outname = Path(ifnone(outname, fname.parent/f'{fname.stem}_tok.csv'))
    cnt = Counter()

    for i,dfp in enumerate(df):
        out,c = tokenize_df(dfp, text_cols, n_workers=n_workers, rules=rules,
                            mark_fields=mark_fields, tok=tok)
        out.text = out.text.str.join(' ')
        out.to_csv(outname, header=(None,header)[i==0], index=False, mode=('a','w')[i==0])
        cnt.update(c)

    save_pickle(outname.with_suffix('.pkl'), cnt)
def load_tokenized_csv(fname):
    "Utility function to quickly load a tokenized csv ans the corresponding counter"
    fname = Path(fname)
    out = pd.read_csv(fname)
    for txt_col in out.columns[1:-1]:
        out[txt_col] = tuple(out[txt_col].str.split(' '))
    return out,load_pickle(fname.with_suffix('.pkl'))

The result is written to a new csv file in `outname` (defaults to the same as `fname` with a `_tok.csv` suffix), which has the same header as the original file, the same non-text columns, plus a `text` column and a `text_length` column as described in `tokenize_df`.

`rules` (which default to `defaults.text_proc_rules`) are applied to each text before it goes into the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column and `True` when there are several. In that case, the texts in all those columns are joined with `FLD` markers followed by the number of the field.

The csv file is opened with `header` and, optionally, read in chunks of `chunksize`. If that argument is passed, each chunk is processed independently and appended to the output file, to save memory usage.
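
A usage sketch (the file names here are hypothetical):

# Illustrative only: tokenize a csv in chunks of 10,000 rows, then reload the result and its counter
# tokenize_csv(Path('reviews.csv'), text_cols='text', chunksize=10_000)
# df_tok,counter = load_tokenized_csv(Path('reviews_tok.csv'))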

def _prepare_texts(tmp_d):
    "Prepare texts in a folder struct in tmp_d, a csv file and returns a dataframe"
    path = Path(tmp_d)/'tmp'
    path.mkdir()
    for d in ['a', 'b', 'c']: 
        (path/d).mkdir()
        for i in range(5):
            with open(path/d/f'text{i}.txt', 'w') as f: f.write(f"This is an example of text {d} {i}")
    
    texts = [f"This is an example of text {d} {i}" for i in range(5) for d in ['a', 'b', 'c']]
    df = pd.DataFrame({'text': texts, 'label': list(range(15))}, columns=['text', 'label'])
    csv_fname = tmp_d/'input.csv'
    df.to_csv(csv_fname, index=False)
    return path,df,csv_fname
# Integration test
with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    #Tokenize as a folder
    tokenize_folder(path)
    outp = Path(tmp_d)/'tmp_tok'
    for d in ['a', 'b', 'c']: 
        p = outp/d
        for i in range(5):
            test_eq((p/f'text{i}.txt').read_text(), ' '.join([
                BOS, TK_MAJ, 'this', 'is', 'an', 'example', 'of', 'text', d, str(i) ]))
    cnt_a = load_pickle(outp/fn_counter_pkl)
    test_eq(cnt_a['this'], 15)
    test_eq(cnt_a['a'], 5)
    test_eq(cnt_a['0'], 3)
    
    #Tokenize as files
    files = get_text_files(path)
    tokenize_files(files, path, output_dir=path/'d')
    for f in files: 
        test_eq((path/'d'/f.relative_to(path)).read_text(), ' '.join([
                BOS, TK_MAJ, 'this', 'is', 'an', 'example', 'of', 'text', f.parent.name, f.name[4]]))
    
    #Tokenize as individual texts
    out = tokenize_texts(df['text'].values)
    test_eq(out, [(outp/d/f'text{i}.txt').read_text().split(' ') for i in range(5) for d in ['a', 'b', 'c']])
    
    #Tokenize as a dataframe
    out,cnt_b = tokenize_df(df, text_cols='text')
    test_eq(list(out.columns), ['label', 'text', 'text_length'])
    test_eq(out['label'].values, df['label'].values)
    test_eq(list(out['text']), [(outp/d/f'text{i}.txt').read_text().split(' ') for i in range(5) for d in ['a', 'b', 'c']])
    test_eq(cnt_a, cnt_b)
    
    #Tokenize as a csv
    out_fname = Path(tmp_d)/'output.csv'
    tokenize_csv(csv_fname, text_cols='text', outname=out_fname)
    a,b = load_tokenized_csv(out_fname)
    test_eq((out,cnt_b), load_tokenized_csv(out_fname))

Tokenizer -

class Tokenizer(Transform):
    "Provides a consistent `Transform` interface to tokenizers operating on `DataFrame`s and folders"
    input_types = (str, list, L, tuple, Path)
    def __init__(self, tok, rules=None, counter=None, lengths=None, mode=None, sep=' '):
        if isinstance(tok,type): tok=tok()
        store_attr('tok,counter,lengths,mode,sep')
        self.rules = defaults.text_proc_rules if rules is None else rules

    @classmethod
    @delegates(tokenize_df, keep=True)
    def from_df(cls, text_cols, tok=None, rules=None, sep=' ', **kwargs):
        if tok is None: tok = WordTokenizer()
        res = cls(tok, rules=rules, mode='df')
        res.kwargs,res.train_setup = merge({'tok': tok}, kwargs),False
        res.text_cols,res.sep = text_cols,sep
        default_val = inspect.signature(tokenize_df).parameters['tok_text_col'].default
        res.tok_text_col = kwargs.get('tok_text_col', default_val)
        return res

    @classmethod
    @delegates(tokenize_folder, keep=True)
    def from_folder(cls, path, tok=None, rules=None, **kwargs):
        path = Path(path)
        if tok is None: tok = WordTokenizer()
        output_dir = tokenize_folder(path, tok=tok, rules=rules, **kwargs)
        res = cls(tok, counter=load_pickle(output_dir/fn_counter_pkl),
                  lengths=load_pickle(output_dir/fn_lengths_pkl), rules=rules, mode='folder')
        res.path,res.output_dir = path,output_dir
        return res

    def setups(self, dsets):
        if not self.mode == 'df' or not isinstance(dsets.items, pd.DataFrame): return
        dsets.items,count = tokenize_df(dsets.items, self.text_cols, rules=self.rules, **self.kwargs)
        if self.counter is None: self.counter = count
        if self.lengths is None: self.lengths = dsets.items[f'{self.tok_text_col}_length'].values
        return dsets

    def encodes(self, o:Path):
        if self.mode=='folder' and str(o).startswith(str(self.path)):
            tok = self.output_dir/o.relative_to(self.path)
            return L(tok.read_text(encoding='UTF-8').split(' '))
        else: return self._tokenize1(o.read_text())

    def encodes(self, o:str): return self._tokenize1(o)
    def _tokenize1(self, o): return first(self.tok([compose(*self.rules)(o)]))

    def get_lengths(self, items):
        if self.lengths is None: return None
        if self.mode == 'df':
            if isinstance(items, pd.DataFrame) and f'{self.tok_text_col}_length' in items.columns:
                return items[f'{self.tok_text_col}_length'].values
        if self.mode == 'folder':
            try:
                res = [self.lengths[str(Path(i).relative_to(self.path))] for i in items]
                if len(res) == len(items): return res
            except: return None

    def decodes(self, o): return TitledStr(self.sep.join(o))
with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    items = get_text_files(path)
    splits = RandomSplitter()(items)
    dsets = Datasets(items, [Tokenizer.from_folder(path)], splits=splits)
    print(dsets.train[0])
    
    dsets = Datasets(df, [Tokenizer.from_df('text')], splits=splits)
    print(dsets.train[0][0].text)
(['xxbos', 'xxmaj', 'this', 'is', 'an', 'example', 'of', 'text', 'b', '0'],)
('xxbos', 'xxmaj', 'this', 'is', 'an', 'example', 'of', 'text', 'c', '3')
tst = test_set(dsets, ['This is a test', 'this is another test'])
test_eq(tst, [(['xxbos', 'xxmaj', 'this','is','a','test'],), 
              (['xxbos','this','is','another','test'],)])
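
The transform can also be called directly on a raw string, since an `encodes` method is registered for `str`; a quick sketch:

tok_tfm = Tokenizer(WordTokenizer())
test_eq(tok_tfm("This is a TEST"), [BOS, TK_MAJ, 'this', 'is', 'a', TK_UP, 'test'])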

Sentencepiece

eu_langs = ["bg", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr", "ga", "hr", "hu",
            "it","lt","lv","mt","nl","pl","pt","ro","sk","sl","sv"] # 所有欧洲语言
class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp
    "SentencePiece tokenizer for `lang`"
    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,
                 model_type='unigram', char_coverage=None, cache_dir='tmp'):
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece!=0.1.90,!=0.1.91`')
        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)
        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type
        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)
        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)
        if sp_model is None: self.tok = None
        else:
            self.tok = SentencePieceProcessor()
            self.tok.Load(str(sp_model))
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_vocab_sz(self, raw_text_path):
        cnt = Counter()
        with open(raw_text_path, 'r') as f:
            for line in f.readlines():
                cnt.update(line.split())
                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz
        res = len(cnt)//4
        while res%8 != 0: res+=1
        return max(res,29)

    def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
        from sentencepiece import SentencePieceTrainer
        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
        spec_tokens = ['\u2581'+s for s in self.special_toks]
        SentencePieceTrainer.Train(" ".join([
            f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
            f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
            f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2",
            f"--user_defined_symbols={','.join(spec_tokens)} --hard_vocab_limit=false"]))
        raw_text_path.unlink()
        return self.cache_dir/'spm.model'

    def setup(self, items, rules=None):
        from sentencepiece import SentencePieceProcessor
        if rules is None: rules = []
        if self.tok is not None: return {'sp_model': self.sp_model}
        raw_text_path = self.cache_dir/'texts.out'
        with open(raw_text_path, 'w') as f:
            for t in progress_bar(maps(*rules, items), total=len(items), leave=False):
                f.write(f'{t}\n')
        sp_model = self.train(raw_text_path)
        self.tok = SentencePieceProcessor()
        self.tok.Load(str(sp_model))
        return {'sp_model': sp_model}

    def __call__(self, items):
        if self.tok is None: self.setup(items)
        for t in items: yield self.tok.EncodeAsPieces(t)
SubwordTokenizer = SentencePieceTokenizer
texts = [f"This is an example of text {i}" for i in range(10)]
df = pd.DataFrame({'text': texts, 'label': list(range(10))}, columns=['text', 'label'])
out,cnt = tokenize_df(df, text_cols='text', tok=SentencePieceTokenizer(vocab_sz=34), n_workers=1)
with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    items = get_text_files(path)
    splits = RandomSplitter()(items)
    tok = SentencePieceTokenizer(special_toks=[])
    dsets = Datasets(items, [Tokenizer.from_folder(path, tok=tok)], splits=splits)
    print(dsets.train[0][0])
    
with warnings.catch_warnings():
    dsets = Datasets(df, [Tokenizer.from_df('text', tok=tok)], splits=splits)
    print(dsets.train[0][0].text)
['▁xx', 'b', 'o', 's', '▁xx', 'm', 'a', 'j', '▁t', 'h', 'i', 's', '▁', 'i', 's', '▁a', 'n', '▁', 'ex', 'a', 'm', 'p', 'l', 'e', '▁', 'o', 'f', '▁t', 'ex', 't', '▁', 'b', '▁', '2']
['▁xx', 'b', 'o', 's', '▁xx', 'm', 'a', 'j', '▁t', 'h', 'i', 's', '▁', 'i', 's', '▁a', 'n', '▁', 'ex', 'a', 'm', 'p', 'l', 'e', '▁', 'o', 'f', '▁t', 'ex', 't', '▁a', '▁', '4']
/home/jhoward/miniconda3/lib/python3.8/site-packages/numpy/core/_asarray.py:102: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  return array(a, dtype, copy=False, order=order)

Export -

from nbdev import nbdev_export
nbdev_export()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 36_text.models.qrnn.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.azureml.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.