数据集:
tasksource/oasst1_dense_flat
许可:
apache-2.0

OASST1 数据集,但附带了检索得到的 parent_text,并且只保留具有密集标注的消息(所有标签均有 2 个标注者)。
"""Build the dense, flattened OASST1 dataset and push it to the Hugging Face Hub."""

from collections import Counter


def flatten_split(df):
    """Return a dense, flattened copy of one OASST1 split.

    Steps, in order:

    1. Add a ``parent_text`` column holding the text of the message referenced
       by ``parent_id`` (empty string when there is no parent or the parent is
       not present in this split).
    2. Drop rows without labels.
    3. Keep only rows whose smallest per-label annotation count is greater
       than 2.
    4. Keep only rows whose ordered list of label names equals the most
       frequent one in this split.
    5. Add one column per label name containing that label's value.

    Args:
        df: pandas DataFrame with ``message_id``, ``parent_id``, ``text`` and
            ``labels`` columns, where each non-null ``labels`` entry is a
            mapping with parallel ``name``, ``value`` and ``count`` sequences.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: If no row survives the label filters.
    """
    id_to_text = df.set_index("message_id")["text"].to_dict()
    df = df.assign(
        parent_text=df["parent_id"].map(lambda pid: id_to_text.get(pid, ""))
    )

    df = df[df["labels"].notna()]

    # NOTE(review): the dataset description mentions 2 annotators per label,
    # while this filter requires strictly more than 2 -- confirm which one is
    # intended. The original threshold is kept unchanged here.
    is_dense = df["labels"].map(lambda lab: min(lab["count"]) > 2).astype(bool)
    df = df[is_dense]

    if df.empty:
        raise ValueError("no densely annotated messages left in this split")

    # Count label-name tuples (hashable) to find the most common label set.
    name_counts = Counter(tuple(lab["name"]) for lab in df["labels"])
    most_common_names = name_counts.most_common(1)[0][0]
    label_names = [str(name) for name in most_common_names]

    has_common_names = df["labels"].map(
        lambda lab: tuple(lab["name"]) == most_common_names
    ).astype(bool)
    # Copy so that adding the label columns below does not write into a view.
    df = df[has_common_names].copy()

    for label in label_names:
        # Every remaining row shares the same name order, so the position can
        # be resolved once per label instead of once per row.
        pos = label_names.index(label)
        df[label] = df["labels"].map(lambda lab, pos=pos: lab["value"][pos])
    return df


def main():
    """Flatten the train and validation splits and push them to the Hub."""
    # Imported here so that flatten_split stays usable with pandas alone.
    from datasets import Dataset, DatasetDict, load_dataset

    raw = load_dataset("OpenAssistant/oasst1")  # load once for both splits
    splits = {}
    for split in ("train", "validation"):
        # The label set is chosen independently for each split.
        frame = flatten_split(raw[split].to_pandas())
        splits[split] = Dataset.from_pandas(frame, preserve_index=False)
    DatasetDict(splits).push_to_hub("oasst1_dense_flat")


if __name__ == "__main__":
    main()
https://github.com/LAION-AI/Open-Assistant
@article{kopf2023openassistant, title={OpenAssistant Conversations--Democratizing Large Language Model Alignment}, author={K{\"o}pf, Andreas and Kilcher, Yannic and von R{\"u}tte, Dimitri and Anagnostidis, Sotiris and Tam, Zhi-Rui and Stevens, Keith and Barhoum, Abdullah and Duc, Nguyen Minh and Stanley, Oliver and Nagyfi, Rich{\'a}rd and others}, journal={arXiv preprint arXiv:2304.07327}, year={2023} }