lmops
29 строк · 972.0 Байт
1'''
2for knn search
3'''
4from datasets import load_dataset
5from typing import Any, Dict, Iterable
6import torch
7import pandas as pd
8import tqdm
9
class IndexerDatasetReader(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``'instruction'`` field of each entry.

    Each item is a dict holding the token ids, the attention mask and a small
    metadata dict carrying the entry's position in the underlying data.
    """

    def __init__(self, tokenizer, data) -> None:
        """Store the tokenizer and the underlying data.

        Args:
            tokenizer: Object exposing ``encode_plus(text, truncation=...,
                return_tensors=...)`` whose result has ``input_ids`` and
                ``attention_mask`` attributes (presumably a Hugging Face
                tokenizer -- confirm against the caller).
            data: Indexable, sized collection of entries; every entry must
                provide an ``'instruction'`` key.
        """
        self.tokenizer = tokenizer
        self.dataset = data

    def __getitem__(self, index):
        """Return the tokenized instance for the entry at ``index``."""
        return self.text_to_instance(self.dataset[index], index=index)

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.dataset)

    def text_to_instance(self, entry: Dict[str, Any], index=-1):
        """Tokenize one entry and package it as a model-ready dict.

        Args:
            entry: Mapping with an ``'instruction'`` key holding the text to
                encode.
            index: Position of the entry, echoed back under
                ``metadata['id']``; defaults to ``-1``.

        Returns:
            Dict with ``'input_ids'``, ``'attention_mask'`` and
            ``'metadata'`` (``{"id": index}``).
        """
        enc_text = entry['instruction']
        tokenized_inputs = self.tokenizer.encode_plus(
            enc_text, truncation=True, return_tensors='pt'
        )
        # Drop only the leading batch axis (assumed to be of size 1 for a
        # single text with return_tensors='pt' -- TODO confirm). A bare
        # ``squeeze()`` would also collapse a one-token sequence into a 0-d
        # scalar, which is no longer a valid 1-D sequence for padding.
        return {
            'input_ids': tokenized_inputs.input_ids.squeeze(0),
            'attention_mask': tokenized_inputs.attention_mask.squeeze(0),
            "metadata": {"id": index},
        }
30
31
32