文本数据

! [ -e /content ] && pip install -Uqq fastai  # 在Colab上升级fastai
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.text.core import *
from nbdev.showdoc import *

::: {#cell-4 .cell 0=‘d’ 1=‘e’ 2=‘f’ 3=‘a’ 4=‘u’ 5=‘l’ 6=‘t’ 7=’_’ 8=‘e’ 9=‘x’ 10=‘p’ 11=’ ’ 12=‘t’ 13=‘e’ 14=‘x’ 15=‘t’ 16=‘.’ 17=‘d’ 18=‘a’ 19=‘t’ 20=‘a’}

### 默认类级别 3

:::

帮助在 Datasets 中收集文本数据的函数和转换。

反向

反转文本可以提高与前向模型的集成的准确性。所需的仅仅是一个 type_tfm,它将在文本输入时反转文本:

def reverse_text(x):
    "Return `x` with the order of its first dimension reversed"
    flipped = x.flip(0)
    return flipped
# Reversing a 1-D tensor must return its elements in the opposite order
t = tensor([0,1,2])
r = reverse_text(t)
test_eq(r, tensor([2,1,0]))

数值化

数值化是将标记转换为整数的步骤。第一步是建立一个标记与索引的对应关系,这被称为词汇表。

def make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):
    "Create a vocab of at most `max_vocab` tokens from `Counter` `count`, keeping items seen at least `min_freq` times"
    # Most frequent tokens first, dropping the ones that are too rare
    vocab = []
    for tok, freq in count.most_common(max_vocab):
        if freq >= min_freq: vocab.append(tok)
    special_toks = ifnone(special_toks, defaults.text_spec_tok)
    # Move every special token to the front, preserving the order of `special_toks`
    for tok in reversed(special_toks):
        if tok in vocab: vocab.remove(tok)
        vocab.insert(0, tok)
    vocab = vocab[:max_vocab]
    # Pad with `xxfake` entries up to the next multiple of 8 (a full 8 are added when already aligned)
    n_fake = 8 - len(vocab) % 8
    return vocab + ['xxfake'] * n_fake

如果标记的数量超过 max_vocab,则保留出现频率最高的那些标记。

Note

为了提高混合精度下的性能,词汇表的大小总是 8 的倍数,必要时会通过添加 xxfake 标记来补齐。

count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default `min_freq=3` only 'a' is frequent enough to join the special tokens
test_eq(set([x for x in make_vocab(count) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a'.split()))
# The vocab length is always a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# `min_freq=1` keeps every token
test_eq(set([x for x in make_vocab(count, min_freq=1) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a b c d'.split()))
# With `max_vocab=12` the least frequent token 'd' is dropped
test_eq(set([x for x in make_vocab(count,max_vocab=12, min_freq=1) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a b c'.split()))
class TensorText(TensorBase):
    "Semantic type for a tensor representing text"

class LMTensorText(TensorText):
    "Semantic type for a tensor representing text in language modeling"
class Numericalize(Transform):
    "Reversible transform mapping tokenized texts to integer ids and back"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, special_toks=None):
        store_attr('vocab,min_freq,max_vocab,special_toks')
        # Token -> index lookup; tokens missing from it resolve to 0 through `defaultdict(int)`
        if vocab is None: self.o2i = None
        else: self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(vocab)})

    def setups(self, dsets):
        # Nothing to build without data, or when a vocab was supplied at construction
        if dsets is None or self.vocab is not None: return
        # Prefer a token counter already attached to `dsets`; otherwise count every token of every item
        if getattr(dsets, 'counter', None) is not None: counter = dsets.counter
        else: counter = Counter(p for o in dsets for p in o)
        if self.special_toks is None and hasattr(dsets, 'special_toks'):
            self.special_toks = dsets.special_toks
        self.vocab = make_vocab(counter, min_freq=self.min_freq, max_vocab=self.max_vocab, special_toks=self.special_toks)
        # The `xxfake` padding entries are left out of the lookup table
        self.o2i = defaultdict(int, {tok:idx for idx,tok in enumerate(self.vocab) if tok != 'xxfake'})

    def encodes(self, o):
        ids = [self.o2i[tok] for tok in o]
        return TensorText(tensor(ids))
    def decodes(self, o):
        return L(self.vocab[idx] for idx in o)
# Fit a `Numericalize` with `min_freq=2` on two tokenized sentences
num = Numericalize(min_freq=2)
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
start = 'This is an example of text '

如果没有传递 vocab,则在设置时使用 make_vocab 以及 min_freq 和 max_vocab 从数据中创建一个。

start = 'This is an example of text'
# With `min_freq=1` every token of both sentences ends up in the vocab
num = Numericalize(min_freq=1)
num.setup(L(start.split(), 'this is another text'.split()))
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'This is an example of text this another'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())

test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding the ids gives back the original tokens
test_eq(num.decode(t), start.split())
# With `min_freq=2` only 'is' and 'text' are kept; every other token is encoded as 0 and decoded as `UNK`
num = Numericalize(min_freq=2)
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'is text'.split()))
test_eq(len(num.vocab)%8, 0)
t = num(start.split())
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text'.split())
# `Numericalize` used as the last step of a `TfmdLists` pipeline, after tokenization of a dataframe
df = pd.DataFrame({'texts': ['This is an example of text', 'this is another text']})
tl = TfmdLists(df, [attrgetter('text'), Tokenizer.from_df('texts'), Numericalize(min_freq=2)])
test_eq(tl, [tensor([2, 8, 9, 10, 0, 0, 0, 11]), tensor([2, 9, 10,  0, 11])])

LM_DataLoader -

def _maybe_first(o):
    "Return the first element of `o` when it is a tuple, otherwise `o` itself"
    if isinstance(o, tuple): return o[0]
    return o
def _get_tokenizer(ds):
    "Return the `Tokenizer` found in `ds.tokenizer` (directly or inside a list/`L`), or `None` if there is none"
    candidate = getattr(ds, 'tokenizer', None)
    if isinstance(candidate, Tokenizer): return candidate
    if not isinstance(candidate, (list,L)): return None
    # First `Tokenizer` of the collection, if any
    return next((t for t in candidate if isinstance(t, Tokenizer)), None)
def _get_lengths(ds):
    "Return what the tokenizer of `ds` reports for `get_lengths(ds.items)`, or `None` when `ds` has no tokenizer"
    tokenizer = _get_tokenizer(ds)
    return None if tokenizer is None else tokenizer.get_lengths(ds.items)
#TODO: add backward
@delegates()
class LMDataLoader(TfmdDL):
    "A `DataLoader` suitable for language modeling"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        # Reindexable view over the dataset; `_maybe_first` keeps only the first element of tuple items
        self.items = ReindexCollection(dataset, cache=cache, tfm=_maybe_first)
        self.seq_len = seq_len
        # Text lengths: use the ones given, else ask the dataset's tokenizer, else measure every item
        if lens is None: lens = _get_lengths(dataset)
        if lens is None: lens = [len(o) for o in self.items]
        # The lengths are wrapped with the same index list as `self.items`
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label, we throw away the end that's less than bs
        corpus = round_multiple(sum(lens)-1, bs, round_down=True)
        self.bl = corpus//bs #bl stands for batch length
        # Number of batches needed to cover `bl` tokens per row, i.e. ceil(bl / seq_len)
        self.n_batches = self.bl//(seq_len) + int(self.bl%seq_len!=0)
        # Sequence length of the final (possibly shorter) batch
        self.last_len = self.bl - (self.n_batches-1)*seq_len
        self.make_chunks()
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # One item per (batch, row) pair
        # NOTE(review): set after `super().__init__`, presumably so the parent does not overwrite it -- confirm
        self.n = self.n_batches*bs

    # `Chunks` exposes `self.items` as one sliceable stream (see the slicing in `create_item`)
    def make_chunks(self): self.chunks = Chunks(self.items, self.lens)
    def shuffle_fn(self,idxs):
        # Shuffle the texts themselves and rebuild the stream; the item indices are returned untouched
        self.items.shuffle()
        self.make_chunks()
        return idxs

    def create_item(self, seq):
        if seq is None: seq = 0
        if seq>=self.n: raise IndexError
        # Items of the last batch use `last_len`, all others `seq_len`
        sl = self.last_len if seq//self.bs==self.n_batches-1 else self.seq_len
        # Row `seq%bs` starts at offset `(seq%bs)*bl` in the stream; batch `seq//bs` advances it by `seq_len` per batch
        st = (seq%self.bs)*self.bl + (seq//self.bs)*self.seq_len
        # One extra token is read so the target can be the input shifted by one position
        txt = self.chunks[st : st+sl+1]
        return LMTensorText(txt[:-1]),txt[1:]

    @delegates(TfmdDL.new)
    def new(self, dataset=None, seq_len=None, **kwargs):
        # Reuse the known lengths only when the dataset is unchanged; default to the current `seq_len`
        lens = self.lens.coll if dataset is None else None
        seq_len = self.seq_len if seq_len is None else seq_len
        return super().new(dataset=dataset, lens=lens, seq_len=seq_len, **kwargs)
show_doc(LMDataLoader, title_level=2)

class LMDataLoader[source]

LMDataLoader(dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, shuffle:bool=False, verbose:bool=False, do_setup:bool=True, pin_memory=False, timeout=0, batch_size=None, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, pin_memory_device='', wif=None, before_iter=None, after_item=None, before_batch=None, after_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None) :: TfmdDL

A DataLoader suitable for language modeling

dataset 应该是一个数值化文本的集合,以便这个函数能够正常工作。可以传递 lens 来优化创建,或者 LMDataLoader 将对 dataset 进行一次完整的遍历以计算它们。cache 用于避免不必要地重新加载项目。

LMDataLoader 将把所有文本(可能经过 shuffle 打乱)连接成一个大的数据流,将其分割成 bs 个连续的序列,然后每次读取其中 seq_len 个标记。

隐藏
# 23 tokens in total: the stream is cut into `bs=4` rows of 5 tokens, so the second batch is shorter
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]).map(tensor)
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
list(dl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [5, 6, 7], [10, 11, 12], [15, 16, 17]]),
      tensor([[1, 2, 3], [6, 7, 8], [11, 12, 13], [16, 17, 18]])],
     [tensor([[3, 4], [8,  9], [13, 14], [18, 19]]),
      tensor([[4, 5], [9, 10], [14, 15], [19, 20]])]])
# 25 tokens in total: rows of 6 tokens give two full batches of `seq_len=3`
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])
#Check that passing `lens` works
dl = LMDataLoader(ints, lens=ints.map(len), bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])
# With shuffling, targets must still be the inputs shifted by one token
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
for x,y in dl: test_eq(x[:,1:], y[:,:-1])
((x0,y0), (x1,y1)) = tuple(dl)
#Second batch begins where first batch ended
test_eq(y0[:,-1], x1[:,0]) 
test_eq(type(x0), LMTensorText)
#Test that `new` works
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
dl1 = dl.new()
test_eq(dl1.seq_len, sl)
dl2 = dl.new(seq_len=2)
test_eq(dl2.seq_len, 2)

显示 -

@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Show up to `max_n` text `samples` in a dataframe, applying `truncate(trunc_at)` to the first element of each"
    if ctxs is None:
        n_rows = min(len(samples), max_n)
        ctxs = get_empty_df(n_rows)
    if trunc_at is not None:
        # Only the first element (the text) is truncated; the rest of each sample is kept as is
        samples = L((sample[0].truncate(trunc_at), *sample[1:]) for sample in samples)
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    frame = pd.DataFrame(ctxs)
    display_df(frame)
    return ctxs
@typedispatch
def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, trunc_at=150, **kwargs):
    "Truncate both elements of each sample, then defer to the `TensorText` version of `show_batch`"
    truncated = L((sample[0].truncate(trunc_at), sample[1].truncate(trunc_at)) for sample in samples)
    # Truncation is already done here, so it is switched off (`trunc_at=None`) in the delegated call
    return show_batch[TensorText](x, None, truncated, ctxs=ctxs, max_n=max_n, trunc_at=None, **kwargs)

分类

对于分类,我们通过使用填充来处理文本长度不一致的问题。

class Pad_Input(ItemTransform):
    "Pad the `pad_fields` of every sample in a batch to a common length with `pad_idx`"
    def encodes(self,samples, pad_idx=1, pad_fields=0, pad_first=False, backwards=False):
        "Function that collect `samples` and adds padding"
        self.pad_idx = pad_idx  # remembered so that `decodes` strips the same value
        pad_fields = L(pad_fields)
        # Longest length found in each padded field across the batch
        max_len_l = pad_fields.map(lambda f: max([len(s[f]) for s in samples]))
        # The tensor is flipped after padding when `backwards`, so the padding side is swapped beforehand
        if backwards: pad_first = not pad_first
        def _f(field_idx, x):
            # Fields that are not listed in `pad_fields` go through untouched
            if field_idx not in pad_fields: return x
            idx = pad_fields.items.index(field_idx) #TODO: remove items if L.index is fixed
            pad =  x.new_zeros(max_len_l[idx]-x.shape[0])+pad_idx
            x1 = torch.cat([pad, x] if pad_first else [x, pad])
            if backwards: x1 = x1.flip(0)
            return retain_type(x1, x)
        return [tuple(map(lambda idxx: _f(*idxx), enumerate(s))) for s in samples]
    def decodes(self, o:TensorText):
        "Remove every element equal to the padding index from `o`"
        # Falls back to 1 (the default of `encodes`) when nothing has been encoded yet
        pad_idx = getattr(self, 'pad_idx', 1)
        return o[o != pad_idx]
pad_input=Pad_Input()

pad_idx 用于填充,填充应用于样本的 pad_fields。如果 pad_first 为 True,则填充应用于开头;如果设置了 backwards,则张量会被翻转。

# By default the padding is appended to the first field of each sample
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0), 
        [(tensor([1,2,3]),1), (tensor([4,5,0]),2), (tensor([6,0,0]), 3)])
# `pad_fields=1` pads the second field instead
test_eq(pad_input([(tensor([1,2,3]), (tensor([6]))), (tensor([4,5]), tensor([4,5])), (tensor([6]), (tensor([1,2,3])))], pad_idx=0, pad_fields=1), 
        [(tensor([1,2,3]),(tensor([6,0,0]))), (tensor([4,5]),tensor([4,5,0])), ((tensor([6]),tensor([1, 2, 3])))])
# `pad_first=True` puts the padding in front
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, pad_first=True), 
        [(tensor([1,2,3]),1), (tensor([0,4,5]),2), (tensor([0,0,6]), 3)])
# `backwards=True` reverses each text, with the padding still at the end
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, backwards=True), 
        [(tensor([3,2,1]),1), (tensor([5,4,0]),2), (tensor([6,0,0]), 3)])
# Decoding a padded `TensorText` removes the padding again
x = pad_input([(TensorText([1,2,3]),1), (TensorText([4,5]), 2), (TensorText([6]), 3)], pad_idx=0)
test_eq(x, [(tensor([1,2,3]),1), (tensor([4,5,0]), 2), (tensor([6,0,0]), 3)])
test_eq(pad_input.decode(x[1][0]), tensor([4,5]))
#Check retain type
x = [(TensorText([1,2,3]),1), (TensorText([4,5]), 2), (TensorText([6]), 3)]
y = pad_input(x, pad_idx=0)
for s in y: test_eq(type(s[0]), TensorText)

x 使用 pad_idx 填充到长度 pad_len。如果 pad_first 为假,所有填充都追加在 x 之后,直到 x 的长度为 pad_len。否则,如果 pad_first 为真,则大小为 seq_len 的填充块将被添加到 x 前面,剩余的填充追加在 x 之后。

def pad_chunk(x,pad_idx=1, pad_first=True, seq_len=72, pad_len=10):
    "Pad `x` to length `pad_len` with `pad_idx`, adding the padding by chunks of size `seq_len`"
    # Amount of padding needed; clamped at 0 so that an `x` already longer than `pad_len`
    # is returned unpadded instead of requesting a tensor with a negative size
    n_pad = max(pad_len - x.shape[0], 0)
    # Whole `seq_len`-sized chunks go before `x` (when `pad_first`) or right after it
    chunk_pad = x.new_zeros((n_pad//seq_len) * seq_len) + pad_idx
    # The leftover (fewer than `seq_len` elements) always goes at the very end
    rest_pad  = x.new_zeros(n_pad % seq_len) + pad_idx
    x1 = torch.cat([chunk_pad, x, rest_pad]) if pad_first else torch.cat([x, chunk_pad, rest_pad])
    return retain_type(x1, x)
# Same input padded to length 8 with `seq_len=3`, once with `pad_first=True` (default) and once with `pad_first=False`
print('pad_first: ',pad_chunk(torch.tensor([1,2,3]),seq_len=3,pad_idx=0,pad_len=8))
print('pad_last:  ',pad_chunk(torch.tensor([1,2,3]),seq_len=3,pad_idx=0,pad_len=8,pad_first=False))
pad_first:  tensor([0, 0, 0, 1, 2, 3, 0, 0])
pad_last:   tensor([1, 2, 3, 0, 0, 0, 0, 0])

pad_input_chunk 是 pad_chunk 的函数式版本,适用于由样本组成的列表。

@delegates(pad_chunk)
def pad_input_chunk(samples, n_inp=1,**kwargs):
    "Pad the first `n_inp` fields of every sample to their common maximum length using `pad_chunk`"
    inp_idxs = range(n_inp)
    # Longest input across all samples and all input fields
    max_len = max([len(sample[i]) for sample in samples for i in inp_idxs])
    res = []
    for sample in samples:
        padded = [pad_chunk(sample[i], pad_len=max_len, **kwargs) for i in inp_idxs]
        # Fields after the inputs are carried over unchanged
        res.append((*padded, *sample[n_inp:]))
    return res

与基础的 pad_input 的区别在于,填充是以 seq_len 的整数倍为单位成块添加的:如果 pad_first=True,这些块加在开头;如果 pad_first=False,则加在文本之后。不足 seq_len 的剩余填充始终追加在末尾。这样做是为了与循环(recurrent)模型的 SequenceEncoder 一起使用。

pad_input_chunk([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)], pad_idx=0, seq_len=3,n_inp=2)
[(TensorText([1, 2, 3, 4, 5, 6]), TensorText([0, 0, 0, 1, 2, 0]), 1)]
# Padding goes first by chunks of `seq_len=2`; an odd leftover is appended at the end
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),1), (tensor([1,2,3]), 2), (tensor([1,2]), 3)], pad_idx=0, seq_len=2), 
        [(tensor([1,2,3,4,5,6]),1), (tensor([0,0,1,2,3,0]),2), (tensor([0,0,0,0,1,2]), 3)])
# Same with samples that only contain an input
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2), 
        [(tensor([1,2,3,4,5,6]),), (tensor([0,0,1,2,3,0]),), (tensor([0,0,0,0,1,2]),)])
# `pad_first=False` puts all the padding after the text
test_eq(pad_input_chunk([(tensor([1,2,3,4,5,6]),), (tensor([1,2,3]),), (tensor([1,2]),)], pad_idx=0, seq_len=2, pad_first=False), 
        [(tensor([1,2,3,4,5,6]),), (tensor([1,2,3,0,0,0]),), (tensor([1,2,0,0,0,0]),)])

# `n_inp=2` pads the first two fields to a common length
test_eq(pad_input_chunk([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)], pad_idx=0, seq_len=2,n_inp=2), 
        [(TensorText([1,2,3,4,5,6]),TensorText([0,0,0,0,1,2]),1)])

pad_input_chunk 的 Transform 版本。这个版本支持类型、解码以及 Transform 的其他功能。

class Pad_Chunk(DisplayedTransform):
    "Pad `samples` by adding padding by chunks of size `seq_len`"
    def __init__(self, pad_idx=1, pad_first=True, seq_len=72,decode=True,**kwargs):
        store_attr('pad_idx,pad_first,seq_len')
        # `decode` cannot be stored under its own name: instances are used as `pc.decode(out)`,
        # so an attribute called `decode` would shadow that method. The flag is kept privately instead.
        self._strip_pad = decode
        super().__init__(**kwargs)
    def before_call(self, b):
        "Set `self.max_len` before encodes" 
        # Longest `TensorText` in the batch; 0 when the batch holds none (nothing gets padded then)
        self.max_len = max((x.shape[0] for xs in b for x in xs if isinstance(x,TensorText)), default=0)
    def __call__(self, b, **kwargs):
        # The target length depends on the whole batch, so it is computed before the per-item dispatch
        self.before_call(b)
        return super().__call__(tuple(b), **kwargs)
    def encodes(self, x:TensorText):
        return pad_chunk(x,pad_idx=self.pad_idx, pad_first=self.pad_first, seq_len=self.seq_len, pad_len=self.max_len)
    def decodes(self, o:TensorText):
        # Strip the padding only when the transform was created with `decode=True`
        return o[o != self.pad_idx] if self._strip_pad else o

这是Pad_Chunk的示例

# Encode one sample holding two `TensorText`s of different length, then decode it back
pc=Pad_Chunk(pad_idx=0,seq_len=3)
out=pc([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)])
print('Inputs:  ',*[(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)])
print('Encoded: ',*out)
print('Decoded: ',*pc.decode(out))
Inputs:   (TensorText([1, 2, 3, 4, 5, 6]), TensorText([1, 2]), 1)
Encoded:  (TensorText([1, 2, 3, 4, 5, 6]), TensorText([0, 0, 0, 1, 2, 0]), 1)
Decoded:  (TensorText([1, 2, 3, 4, 5, 6]), TensorText([1, 2]), 1)
# Padding goes first by chunks of `seq_len=2`; an odd leftover is appended at the end
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),1), (TensorText([1,2,3]), 2), (TensorText([1,2]), 3)]), 
        [(tensor([1,2,3,4,5,6]),1), (tensor([0,0,1,2,3,0]),2), (tensor([0,0,0,0,1,2]), 3)])

# Same with samples that only contain an input
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),), (TensorText([1,2,3]),), (TensorText([1,2]),)]), 
        [(tensor([1,2,3,4,5,6]),), (tensor([0,0,1,2,3,0]),), (tensor([0,0,0,0,1,2]),)])

# `pad_first=False` puts all the padding after the text
pc=Pad_Chunk(pad_idx=0, seq_len=2, pad_first=False)
test_eq(pc([(TensorText([1,2,3,4,5,6]),), (TensorText([1,2,3]),), (TensorText([1,2]),)]), 
        [(tensor([1,2,3,4,5,6]),), (tensor([1,2,3,0,0,0]),), (tensor([1,2,0,0,0,0]),)])

# Every `TensorText` of a sample is padded to the same length
pc=Pad_Chunk(pad_idx=0, seq_len=2)
test_eq(pc([(TensorText([1,2,3,4,5,6]),TensorText([1,2]),1)]), 
        [(TensorText([1,2,3,4,5,6]),TensorText([0,0,0,0,1,2]),1)])
def _default_sort(x):
    "Default sort key: the length of the first element of sample `x`"
    first = x[0]
    return len(first)

@delegates(TfmdDL)
class SortedDL(TfmdDL):
    "A `DataLoader` that goes through the items in the order given by `sort_func`"
    def __init__(self, dataset, sort_func=None, res=None, **kwargs):
        super().__init__(dataset, **kwargs)
        self.sort_func = _default_sort if sort_func is None else sort_func
        # With the default sort key, first try to get the lengths from the dataset's tokenizer
        if res is None and self.sort_func == _default_sort: res = _get_lengths(dataset)
        # Otherwise apply `sort_func` to every item, which takes a full pass over the dataset
        self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res
        # Position of the item with the largest sort key (left unset for an empty `res`)
        if len(self.res) > 0: self.idx_max = np.argmax(self.res)

    def get_idxs(self):
        idxs = super().get_idxs()
        # When shuffling, the order produced by the parent (through `shuffle_fn`) is used as is
        if self.shuffle: return idxs
        # Otherwise go from the largest sort key to the smallest
        return sorted(idxs, key=lambda i: self.res[i], reverse=True)

    def shuffle_fn(self,idxs):
        # Start from a fresh random permutation; the incoming `idxs` are not used
        idxs = np.random.permutation(len(self.dataset))
        # Move the item with the largest sort key to the very first position
        idx_max = np.where(idxs==self.idx_max)[0][0]
        idxs[0],idxs[idx_max] = idxs[idx_max],idxs[0]
        # Sort by decreasing key inside chunks of 50 batches, so each batch holds items of similar key
        sz = self.bs*50
        chunks = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        chunks = [sorted(s, key=lambda i: self.res[i], reverse=True) for s in chunks]
        sort_idx = np.concatenate(chunks)

        # Cut into batches and shuffle them, keeping the first and the last batch in place
        sz = self.bs
        batches = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        sort_idx = np.concatenate(np.random.permutation(batches[1:-1])) if len(batches) > 2 else np.array([],dtype=int)
        sort_idx = np.concatenate((batches[0], sort_idx) if len(batches)==1 else (batches[0], sort_idx, batches[-1]))
        return iter(sort_idx)

    @delegates(TfmdDL.new)
    def new(self, dataset=None, **kwargs):
        # A non-None `val_res` in `kwargs` takes precedence; otherwise `self.res` is reused only for the same dataset
        if 'val_res' in kwargs and kwargs['val_res'] is not None: res = kwargs['val_res']
        else: res = self.res if dataset is None else None
        return super().new(dataset=dataset, res=res, **kwargs)

res 是将 sort_func 应用于 dataset 中所有元素的结果。如果可用,您可以将其传递以通过避免对整个数据集的初始遍历来加快初始化速度。例如,如果按文本长度排序(如默认的 sort_func,称为 _default_sort),您应该将一个包含 dataset 中每个元素长度的列表传递给 res 以利用这种加速。

为了让验证集获得同样的初始化加速,可以将 val_res(验证集的文本长度列表)传递给 SortedDL 的 kwargs 参数。以下是通过同时传递训练集和验证集的文本长度列表来减少初始化时间的示例:

# Pass the training dataset text lengths to SortedDL
srtd_dl=partial(SortedDL, res = train_text_lens)

# Pass the validation dataset text lengths 
dl_kwargs = [{},{'val_res': val_text_lens}]

# Init our Datasets 
dsets = Datasets(...)   

# Init our DataLoaders
dls = dsets.dataloaders(..., dl_type = srtd_dl, dl_kwargs = dl_kwargs)

如果 shuffle 为 True,这将稍微打乱排序结果,使每个批次中的项目大小大致相同,但不严格按照排序顺序。

# Without shuffling, the batches come out sorted by decreasing text length
ds = [(tensor([1,2]),1), (tensor([3,4,5,6]),2), (tensor([7]),3), (tensor([8,9,10]),4)]
dl = SortedDL(ds, bs=2, before_batch=partial(pad_input, pad_idx=0))
test_eq(list(dl), [(tensor([[ 3,  4,  5,  6], [ 8,  9, 10,  0]]), tensor([2, 4])), 
                   (tensor([[1, 2], [7, 0]]), tensor([1, 3]))])
# Shuffled iteration over random-length items (`shuffle_fn` moves the item with the largest sort key to the front)
ds = [(tensor(range(random.randint(1,10))),i) for i in range(101)]
dl = SortedDL(ds, bs=2, create_batch=partial(pad_input, pad_idx=-1), shuffle=True, num_workers=0)
batches = list(dl)
max_len = len(batches[0][0])
for b in batches: 
    assert(len(b[0])) <= max_len 
    test_ne(b[0][-1], -1)

TransformBlock用于文本

要使用数据块 API,您需要这个文本构建块。

class TextBlock(TransformBlock):
    "A `TransformBlock` for texts"
    @delegates(Numericalize.__init__)
    def __init__(self, tok_tfm, vocab=None, is_lm=False, seq_len=72, backwards=False, **kwargs):
        # Tokenize, numericalize, and optionally reverse each text
        type_tfms = [tok_tfm, Numericalize(vocab, **kwargs)]
        if backwards: type_tfms.append(reverse_text)
        # Language models use `LMDataLoader`; other tasks use `SortedDL` and pad each batch with `Pad_Chunk`
        if is_lm: dl_type,dls_kwargs = LMDataLoader,{'seq_len': seq_len}
        else:     dl_type,dls_kwargs = SortedDL,{'before_batch': Pad_Chunk(seq_len=seq_len)}
        super().__init__(type_tfms=type_tfms, dl_type=dl_type, dls_kwargs=dls_kwargs)

    @classmethod
    @delegates(Tokenizer.from_df, keep=True)
    def from_df(cls, text_cols, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):
        "Build a `TextBlock` from a dataframe using `text_cols`"
        # Remaining keyword arguments configure the tokenizer
        tok_tfm = Tokenizer.from_df(text_cols, **kwargs)
        return cls(tok_tfm, vocab=vocab, is_lm=is_lm, seq_len=seq_len, backwards=backwards,
                   min_freq=min_freq, max_vocab=max_vocab)

    @classmethod
    @delegates(Tokenizer.from_folder, keep=True)
    def from_folder(cls, path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, **kwargs):
        "Build a `TextBlock` from a `path`"
        # Remaining keyword arguments configure the tokenizer
        tok_tfm = Tokenizer.from_folder(path, **kwargs)
        return cls(tok_tfm, vocab=vocab, is_lm=is_lm, seq_len=seq_len, backwards=backwards,
                   min_freq=min_freq, max_vocab=max_vocab)

为了高效地进行标记化,您可能希望使用工厂方法之一。否则,您可以传递自定义的 tok_tfm 来处理标记化(如果您的文本已经被标记化,您可以传递 noop),也可以传递一个 vocab,或者让它使用 min_freq 和 max_vocab 从文本中推断词汇表。

is_lm 表示我们是否希望使用文本进行语言建模或其他任务,seq_len 仅在 is_lm=False 时需要调整,并传递给 pad_input_chunk

show_doc(TextBlock.from_df)

TextBlock.from_df[source]

TextBlock.from_df(text_cols, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, tok=None, rules=None, sep=' ', n_workers=8, mark_fields=None, tok_text_col='text', **kwargs)

Build a TextBlock from a dataframe using text_cols

以下是使用存储为CSV文件的IMDB样本的示例:

# Load the IMDB sample as a dataframe
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')

# Text classification: texts are read from the 'text' column and labels from the 'label' column
imdb_clas = DataBlock(
    blocks=(TextBlock.from_df('text', seq_len=72), CategoryBlock),
    get_x=ColReader('text'), get_y=ColReader('label'), splitter=ColSplitter())

dls = imdb_clas.dataloaders(df, bs=64)
dls.show_batch(max_n=2)
text category
0 xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is negative
1 xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies positive

vocab、is_lm、seq_len、min_freq 和 max_vocab 被传递给主初始化,其他参数则传递给 Tokenizer.from_df。

show_doc(TextBlock.from_folder)

TextBlock.from_folder[source]

TextBlock.from_folder(path, vocab=None, is_lm=False, seq_len=72, backwards=False, min_freq=3, max_vocab=60000, tok=None, rules=None, extensions=None, folders=None, output_dir=None, skip_if_exists=True, output_names=None, n_workers=8, encoding='utf8', **kwargs)

Build a TextBlock from a path

vocab、is_lm、seq_len、min_freq 和 max_vocab 被传递给主初始化,其他参数则传递给 Tokenizer.from_folder。

文本数据加载器 -

class TextDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for NLP problems"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, text_vocab=None, is_lm=False,
                    tok_tfm=None, seq_len=72, splitter=None, backwards=False, **kwargs):
        "Create from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)"
        # Without `valid_pct` the split follows the `train`/`valid` folders, otherwise it is random
        use_folders = valid_pct is None
        if splitter is None:
            if use_folders: splitter = GrandparentSplitter(train_name=train, valid_name=valid)
            else:           splitter = RandomSplitter(valid_pct, seed=seed)
        blocks = [TextBlock.from_folder(path, text_vocab, is_lm, seq_len, backwards, tok=tok_tfm)]
        # Language models have no separate target block
        if not is_lm: blocks.append(CategoryBlock(vocab=vocab))
        get_items = partial(get_text_files, folders=[train,valid]) if use_folders else get_text_files
        get_y = None if is_lm else parent_label
        dblock = DataBlock(blocks=blocks, get_items=get_items, splitter=splitter, get_y=get_y)
        return cls.from_dblock(dblock, path, path=path, seq_len=seq_len, **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_df(cls, df, path='.', valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None,
                text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, tok_text_col="text", seq_len=72, backwards=False, **kwargs):
        "Create from `df` in `path` with `valid_pct`"
        blocks = [TextBlock.from_df(text_col, text_vocab, is_lm, seq_len, backwards, tok=tok_tfm)]
        if not is_lm:
            if y_block is None:
                # Several label columns mean a multi-label problem
                multi_label = is_listy(label_col) and len(label_col) > 1
                blocks.append(MultiCategoryBlock if multi_label else CategoryBlock)
            else:
                blocks += (y_block if is_listy(y_block) else [y_block])
        if valid_col is None: splitter = RandomSplitter(valid_pct, seed=seed)
        else:                 splitter = ColSplitter(valid_col)
        get_x = ColReader(tok_text_col)
        get_y = None if is_lm else ColReader(label_col, label_delim=label_delim)
        dblock = DataBlock(blocks=blocks, get_x=get_x, get_y=get_y, splitter=splitter)
        return cls.from_dblock(dblock, df, path=path, seq_len=seq_len, **kwargs)

    @classmethod
    def from_csv(cls, path, csv_fname='labels.csv', header='infer', delimiter=None, quoting=csv.QUOTE_MINIMAL, **kwargs):
        "Create from `csv` file in `path/csv_fname`"
        csv_path = Path(path)/csv_fname
        df = pd.read_csv(csv_path, header=header, delimiter=delimiter, quoting=quoting)
        return cls.from_df(df, path=path, **kwargs)

# Let `from_csv` advertise the keyword arguments it forwards to `from_df`
TextDataLoaders.from_csv = delegates(to=TextDataLoaders.from_df)(TextDataLoaders.from_csv)
show_doc(TextDataLoaders, title_level=2)

class TextDataLoaders[source]

TextDataLoaders(*loaders, path:(str, Path)='.', device=None) :: DataLoaders

Basic wrapper around several DataLoaders with factory methods for NLP problems

您不应该直接使用初始化,而应该使用以下工厂方法之一。所有这些工厂方法都接受以下参数:

  • text_vocab:用于文本数字化的词汇(如果未传递,则从数据中推断)
  • tok_tfm:如果传递,则使用此 tok_tfm 而不是默认值
  • seq_len:用于批处理的序列长度
  • bs:批处理大小
  • val_bs:验证 DataLoader 的批处理大小(默认为 bs)
  • shuffle:是否打乱训练 DataLoader
  • device:要使用的 PyTorch 设备(默认为 default_device())
show_doc(TextDataLoaders.from_folder)

TextDataLoaders.from_folder[source]

TextDataLoaders.from_folder(path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, text_vocab=None, is_lm=False, tok_tfm=None, seq_len=72, splitter=None, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)

Create from imagenet style dataset in path with train and valid subfolders (or provide valid_pct)

如果提供了 valid_pct,则会执行随机拆分(可选地使用 seed),通过将该百分比的数据留出作为验证集(而不是查看祖父母文件夹)。如果传递了 vocab,则仅保留名称在 vocab 中的文件夹。

以下是 IMDB 电影评论数据集的一个示例:

::: {#cell-77 .cell 0=‘缓’ 1=‘慢’}

# Build classification DataLoaders from the IMDB folder using the default
# `train`/`valid` subfolders, then preview three samples.
path = untar_data(URLs.IMDB)
dls = TextDataLoaders.from_folder(path)
dls.show_batch(max_n=3)
text category
0 xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero pos
1 xxbos xxmaj okay , so xxmaj i 'm not a big video game buff , but was the game xxmaj house of the xxmaj dead really famous enough to make a movie from ? xxmaj sure , they went as far as to actually put in quick video game clips throughout the movie , as though justifying any particular scene of violence , but there are dozens and dozens of games that look exactly the same , with the hand in the bottom on the screen , supposedly your own , holding whatever weapon and goo - ing all kinds of aliens or walking dead or snipers or whatever the case may be . \n\n xxmaj it 's an interesting premise in xxmaj house of the xxmaj dead , with a lot of college kids ( loaded college kids , as it were , kids who are able to pay neg
2 xxbos xxup anchors xxup aweigh sees two eager young sailors , xxmaj joe xxmaj brady ( gene xxmaj kelly ) and xxmaj clarence xxmaj doolittle / xxmaj brooklyn ( frank xxmaj sinatra ) , get a special four - day shore leave . xxmaj eager to get to the girls , particularly xxmaj joe 's xxmaj lola , neither xxmaj joe nor xxmaj brooklyn figure on the interruption of little xxmaj navy - mad xxmaj donald ( dean xxmaj stockwell ) and his xxmaj aunt xxmaj susie ( kathryn xxmaj grayson ) . xxmaj unexperienced in the ways of females and courting , xxmaj brooklyn quickly enlists xxmaj joe to help him win xxmaj aunt xxmaj susie over . xxmaj along the way , however , xxmaj joe finds himself falling for the gal he thinks belongs to his best friend . xxmaj how is xxmaj brooklyn going to take pos

:::

# Check that the `tok_tfm` argument is accepted by `from_folder`
path = untar_data(URLs.IMDB)
tknzer = WordTokenizer()
# Language-model variant: the batch shows `text` / `text_` columns
dls = TextDataLoaders.from_folder(path, tok_tfm=tknzer, is_lm=True)
dls.show_batch(max_n=1)
# Classification variant: the batch shows `text` / `category` columns
dls = TextDataLoaders.from_folder(path, tok_tfm=tknzer, is_lm=False)
dls.show_batch(max_n=1)
text text_
0 xxbos xxmaj this film is a spicy little piece of film - making from xxmaj sam xxmaj fuller which gives xxmaj richard xxmaj widmark the chance to show of some of his best , most edgy acting in the role of xxmaj skip mccoy , a small - time thief who stumbles onto a military secret while picking beautiful xxmaj candy 's ( jean xxmaj peters ) pocket on a crowded bus xxmaj this film is a spicy little piece of film - making from xxmaj sam xxmaj fuller which gives xxmaj richard xxmaj widmark the chance to show of some of his best , most edgy acting in the role of xxmaj skip mccoy , a small - time thief who stumbles onto a military secret while picking beautiful xxmaj candy 's ( jean xxmaj peters ) pocket on a crowded bus .
text category
0 xxbos xxmaj match 1 : xxmaj tag xxmaj team xxmaj table xxmaj match xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley vs xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit xxmaj bubba xxmaj ray and xxmaj spike xxmaj dudley started things off with a xxmaj tag xxmaj team xxmaj table xxmaj match against xxmaj eddie xxmaj guerrero and xxmaj chris xxmaj benoit . xxmaj according to the rules of the match , both opponents have to go through tables in order to get the win . xxmaj benoit and xxmaj guerrero heated up early on by taking turns hammering first xxmaj spike and then xxmaj bubba xxmaj ray . a xxmaj german xxunk by xxmaj benoit to xxmaj bubba took the wind out of the xxmaj dudley brother . xxmaj spike tried to help his brother , but the referee restrained him while xxmaj benoit and xxmaj guerrero pos
show_doc(TextDataLoaders.from_df)

TextDataLoaders.from_df[source]

TextDataLoaders.from_df(df, path='.', valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None, text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, tok_text_col='text', seq_len=72, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)

Create from df in path with valid_pct

seed 可以选择性地传递以确保可重复性。text_col、label_col 和可选的 valid_col 分别是文本、标签以及验证标志所在列的索引或名称。对于多标签问题,如果您的标签在一列中,并以特定字符分隔,可以传递 label_delim。如果库未能正确推断目标类型,应传递 y_block 来指明。

此外,您可以使用 tok_text_col 指定标记化文本发送到的特定列。默认情况下,它们在标记化后存储在名为 text 的列中。

以下是 IMDB 子集的示例:

# Load the IMDB sample CSV into a DataFrame and preview its first rows
# (columns: label, text, is_valid).
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/"texts.csv"); df.head()
label text is_valid
0 negative Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff! False
1 positive This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som... False
2 negative Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li... False
3 positive Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie "Duty, Honor, Country" are not just mere words blathered from the lips of a high-brassed offic... False
4 negative This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr... False
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/"texts.csv")
df.columns = ['label', 'text_col', 'is_valid'] # rename the text column to check that `tok_text_col` is handled correctly
dls = TextDataLoaders.from_df(df, path=path, text_col='text_col', label_col='label', valid_col='is_valid')
# Build a test DataLoader from a raw string; its batch is unpacked as a 1-tuple (inputs only).
dl = dls.test_dl(["This movie was bad"])
x, = dl.one_batch()
# Compare against the expected token ids for that sentence.
test_eq(x.cpu(), TensorText([[2,8,21,29,25,97]]))
# Classification DataLoaders from the DataFrame; the `is_valid` column drives the
# train/validation split.
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/"texts.csv")
dls = TextDataLoaders.from_df(df, path=path, text_col='text', label_col='label', valid_col='is_valid')
dls.show_batch(max_n=3)
text category
0 xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is negative
1 xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies positive
2 xxbos xxmaj now that xxmaj che(2008 ) has finished its relatively short xxmaj australian cinema run ( extremely limited xxunk screen in xxmaj xxunk , after xxunk ) , i can xxunk join both xxunk of " at xxmaj the xxmaj movies " in taking xxmaj steven xxmaj soderbergh to task . \n\n xxmaj it 's usually satisfying to watch a film director change his style / subject , but xxmaj soderbergh 's most recent stinker , xxmaj the xxmaj girlfriend xxmaj xxunk ) , was also missing a story , so narrative ( and editing ? ) seem to suddenly be xxmaj soderbergh 's main challenge . xxmaj strange , after 20 - odd years in the business . xxmaj he was probably never much good at narrative , just xxunk it well inside " edgy " projects . \n\n xxmaj none of this excuses him this present , negative
# Test that the `tok_tfm` parameter works in `from_df`, first for classification
# (`is_lm=False`) and then for language modeling (`is_lm=True`)
tknzer = WordTokenizer()
dls = TextDataLoaders.from_df(df, path=path, is_lm=False, tok_tfm=tknzer,
                              text_col='text', label_col='label', valid_col='is_valid'
                             )
dls.show_batch(max_n=1)
dls = TextDataLoaders.from_df(df, path=path, is_lm=True, tok_tfm=tknzer,
                              text_col='text', label_col='label', valid_col='is_valid'
                             )
dls.show_batch(max_n=1)
text category
0 xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is negative
text text_
0 xxbos xxmaj this may have been made for the hell of it , but it was most probably the worst film i 've seen in years , xxmaj the best thing about the entire xxup dvd would be the case xxrep 3 ! xxmaj i 'm surprised that people took the time to make something so rubbish and yet spend money on it too , xxmaj i 'm glad i only rented xxmaj this may have been made for the hell of it , but it was most probably the worst film i 've seen in years , xxmaj the best thing about the entire xxup dvd would be the case xxrep 3 ! xxmaj i 'm surprised that people took the time to make something so rubbish and yet spend money on it too , xxmaj i 'm glad i only rented .
# Language-model DataLoaders: with `is_lm=True` no label column is read.
dls = TextDataLoaders.from_df(df, path=path, text_col='text', is_lm=True, valid_col='is_valid')
dls.show_batch(max_n=3)
text text_
0 xxbos xxmaj critics need to review what they class as a quality movie . i think the critics have seen too many actions films and have xxunk to the xxmaj matrix style of films . xxmaj xxunk is a breath of fresh air , a film with so many layers that one viewing is not enough to understand or appreciate this outstanding film . xxmaj xxunk von xxmaj xxunk shows that old xxmaj critics need to review what they class as a quality movie . i think the critics have seen too many actions films and have xxunk to the xxmaj matrix style of films . xxmaj xxunk is a breath of fresh air , a film with so many layers that one viewing is not enough to understand or appreciate this outstanding film . xxmaj xxunk von xxmaj xxunk shows that old styles
1 xxmaj xxunk is something ) , but noticeable moments of xxunk as he still struggles to find his humanity . xxmaj this xxunk of his for a real life could get boring , and almost did in xxmaj supremacy , but just works better in xxmaj ultimatum ( better script ) . \n\n i am reminded of a scene in " xxunk " ( the only good xxmaj pierce xxmaj xxunk xxmaj xxunk is something ) , but noticeable moments of xxunk as he still struggles to find his humanity . xxmaj this xxunk of his for a real life could get boring , and almost did in xxmaj supremacy , but just works better in xxmaj ultimatum ( better script ) . \n\n i am reminded of a scene in " xxunk " ( the only good xxmaj pierce xxmaj xxunk xxmaj bond
2 xxmaj mr . xxmaj julia , played his role equally as perfect . xxmaj it was interesting to see how reluctant xxmaj richard xxmaj dreyfuss was in replacing the dictator against his will . xxmaj but he became more confident and comfortable with the role as time passed . xxmaj since everything happens for a reason in life , i believe he was forced to replace the dictator because he was meant mr . xxmaj julia , played his role equally as perfect . xxmaj it was interesting to see how reluctant xxmaj richard xxmaj dreyfuss was in replacing the dictator against his will . xxmaj but he became more confident and comfortable with the role as time passed . xxmaj since everything happens for a reason in life , i believe he was forced to replace the dictator because he was meant to
show_doc(TextDataLoaders.from_csv)

TextDataLoaders.from_csv[source]

TextDataLoaders.from_csv(path, csv_fname='labels.csv', header='infer', delimiter=None, valid_pct=0.2, seed=None, text_col=0, label_col=1, label_delim=None, y_block=None, text_vocab=None, is_lm=False, valid_col=None, tok_tfm=None, tok_text_col='text', seq_len=72, backwards=False, bs:int=64, val_bs:int=None, shuffle:bool=True, device=None)

Create from csv file in path/csv_fname

使用给定的 header 和 delimiter 打开 csv 文件,然后将所有其他参数传递给 TextDataLoaders.from_df。

# Read `texts.csv` from `path`; the column arguments are forwarded to `from_df`.
dls = TextDataLoaders.from_csv(path=path, csv_fname='texts.csv', text_col='text', label_col='label', valid_col='is_valid')
dls.show_batch(max_n=3)
text category
0 xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the lower east side , and an xxunk storyline would make the film critic proof . xxmaj he was right , but it did n't fool me . xxmaj raising xxmaj victor xxmaj vargas is negative
1 xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love story , as beautiful as any other ever told , * could * happen to you … a feeling you do n't often get from other romantic comedies positive
2 xxbos xxmaj now that xxmaj che(2008 ) has finished its relatively short xxmaj australian cinema run ( extremely limited xxunk screen in xxmaj xxunk , after xxunk ) , i can xxunk join both xxunk of " at xxmaj the xxmaj movies " in taking xxmaj steven xxmaj soderbergh to task . \n\n xxmaj it 's usually satisfying to watch a film director change his style / subject , but xxmaj soderbergh 's most recent stinker , xxmaj the xxmaj girlfriend xxmaj xxunk ) , was also missing a story , so narrative ( and editing ? ) seem to suddenly be xxmaj soderbergh 's main challenge . xxmaj strange , after 20 - odd years in the business . xxmaj he was probably never much good at narrative , just xxunk it well inside " edgy " projects . \n\n xxmaj none of this excuses him this present , negative

导出 -

# Run nbdev's export step over the project notebooks.
from nbdev import nbdev_export
nbdev_export()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 20b_tutorial.distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 70a_callback.tensorboard.ipynb.
Converted 70b_callback.neptune.ipynb.
Converted 70c_callback.captum.ipynb.
Converted 70d_callback.comet.ipynb.
Converted 74_huggingface.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted app_examples.ipynb.
Converted camvid.ipynb.
Converted distributed_app_examples.ipynb.
Converted migrating_catalyst.ipynb.
Converted migrating_ignite.ipynb.
Converted migrating_lightning.ipynb.
Converted migrating_pytorch.ipynb.
Converted migrating_pytorch_verbose.ipynb.
Converted ulmfit.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.