Source: tokenizers
Section: python
Priority: optional
Maintainer: Debian Deep Learning Team
Uploaders: Kohei Sendai
Rules-Requires-Root: no
#Testsuite: autopkgtest-pkg-python
Standards-Version: 4.7.2
Homepage: https://github.com/huggingface/tokenizers
Vcs-Browser: https://salsa.debian.org/deeplearning-team/tokenizers
Vcs-Git: https://salsa.debian.org/deeplearning-team/tokenizers.git
Build-Depends: debhelper-compat (= 13),
               dh-sequence-python3,
               cargo,
               librust-derive-builder-dev,
               librust-env-logger-dev,
               librust-esaxx-rs-dev,
               librust-indicatif-dev,
               librust-itertools-dev,
               librust-libc-dev,
               librust-macro-rules-attribute-dev,
               librust-monostate-dev,
               librust-ndarray-dev,
               librust-numpy-dev,
               librust-onig-dev,
               librust-pyo3-dev,
               librust-rand-dev,
               librust-rayon-cond-dev,
               librust-rayon-dev,
               librust-regex-dev,
               librust-regex-syntax-dev,
               librust-serde-derive-dev,
               librust-serde-dev,
               librust-serde-json-dev,
               librust-spm-precompiled-dev,
               librust-thiserror-1-dev,
               librust-unicode-categories-dev,
               librust-unicode-normalization-alignments-dev,
               pybuild-plugin-pyproject,
               python3-all,
               python3-filelock,
               python3-fsspec,
               python3-huggingface-hub,
               python3-maturin,
               python3-packaging,
               python3-requests,
               python3-setuptools,
               python3-tqdm,
               python3-typing-extensions,
               python3-yaml,
               rustc

Package: python3-tokenizers
Section: python
Architecture: any
Depends: ${misc:Depends}, ${python3:Depends}, ${shlibs:Depends}
Description: Fast State-of-the-Art Tokenizers for Research and Production
 Provides an implementation of today's most used tokenizers, with a focus on
 performance and versatility.
 .
 Main features:
  * Train new vocabularies and tokenize, using today's most used tokenizers.
  * Extremely fast (both training and tokenization), thanks to the Rust
    implementation. Takes less than 20 seconds to tokenize a GB of text on a
    server's CPU.
  * Easy to use, but also extremely versatile.
  * Designed for research and production.
  * Normalization comes with alignment tracking. It is always possible to get
    the part of the original sentence that corresponds to a given token.
  * Does all the pre-processing: truncation, padding, and adding the special
    tokens your model needs.