This is a subset of the deduplicated Stack dataset.
It was generated as follows:
```python
from datasets import load_dataset, Dataset

# Languages sampled from the-stack-dedup (one data directory per language).
languages = ["css", "prolog", "c", "fortran", "solidity", "kotlin", "literate-agda", "julia", "java-server-pages", "isabelle", "idris", "lean", "powershell", "go", "erlang", "f-sharp", "ada", "pascal", "perl", "r", "protocol-buffer", "cmake", "sas", "ruby", "rust", "rmarkdown", "c-sharp", "smalltalk", "haskell", "maple", "mathematica", "ocaml", "makefile", "lua", "literate-coffeescript", "literate-haskell", "restructuredtext", "racket", "standard-ml", "systemverilog", "tex", "awk", "assembly", "alloy", "agda", "emacs-lisp", "dart", "cuda", "bluespec", "augeas", "batchfile", "tcsh", "stan", "scala", "tcl", "stata", "applescript", "shell", "clojure", "scheme", "antlr", "sparql", "sql", "glsl", "elm", "dockerfile", "cpp", "coffeescript", "common-lisp", "elixir", "groovy", "html", "java", "javascript", "markdown", "php", "python", "typescript", "verilog", "visual-basic", "vhdl", "thrift", "matlab", "yacc", "zig", "xslt", "json", "yaml"]

def dset_gen():
    # Stream each language split and keep at most 250,000 files from it.
    for language in languages:
        dset = load_dataset("bigcode/the-stack-dedup", data_dir=f"data/{language}", streaming=True, split="train")
        sample = dset.take(250_000)
        for row in sample:
            yield row

dset = Dataset.from_generator(dset_gen)
```
num_examples: 11658586
download_size: 28807934580 (bytes)
dataset_size: 78577965159 (bytes)

The total number of examples is below 88 × 250,000 because many of the sampled languages contain fewer than 250,000 files after deduplication.
Each data instance corresponds to one file. The content of the file is provided in the "content" feature, and the other features ("repository_name", "licenses", etc.) provide some metadata. Note that a given file can appear in several repositories that satisfy our safe-license criteria; when that is the case, only the repository that comes first in alphabetical order is shown, for simplicity.
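As a minimal sketch of what an instance looks like, the snippet below streams a single row from one language split and prints the "content" feature next to the remaining fields. The metadata column names are assumed to follow the upstream the-stack-dedup schema and may differ slightly from the names quoted above.

```python
from datasets import load_dataset

# Stream one example from a single language split to inspect its features.
# This is a sketch: the exact set of metadata columns depends on the
# upstream bigcode/the-stack-dedup schema.
stream = load_dataset(
    "bigcode/the-stack-dedup",
    data_dir="data/python",
    streaming=True,
    split="train",
)
example = next(iter(stream))

# The file text lives in "content"; every other feature is metadata.
print(example["content"][:200])
for key, value in example.items():
    if key != "content":
        print(f"{key}: {value}")
```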