Understanding DataLoader in PyTorch

Could someone please help me understand the key logic behind data loaders, so that I can build intuition about them afterwards?

DataLoader is a class which you can extend for your own purpose. Could you ask more specifically about what you want to know?


I am unable to understand its purpose and how it executes.

For example as follows:

class TransliterationDataLoader(Dataset):
    """Dataset of (english_word, hindi_word) transliteration pairs read
    from an XML corpus, with helpers for drawing shuffled mini-batches.
    """

    def __init__(self, filename):
        # eng_words[i] and hindi_words[i] form one transliteration pair.
        self.eng_words, self.hindi_words = self.readXmlDataset(filename, cleanHindiVocab)
        # Indirection table so batches can be drawn in shuffled order
        # without reordering the underlying word lists.
        self.shuffle_indices = list(range(len(self.eng_words)))
        self.shuffle_start_index = 0

    def __len__(self):
        return len(self.eng_words)

    def __getitem__(self, idx):
        return self.eng_words[idx], self.hindi_words[idx]

    def readXmlDataset(self, filename, lang_vocab_cleaner):
        """Parse the XML corpus at *filename* into two parallel word lists.

        Assumes each child of the root element holds the English text at
        position 0 and the second-language text at position 1 (TODO:
        confirm against the actual corpus layout). Entries whose cleaned
        word lists differ in length are treated as noise and skipped.
        Returns (lang1_words, lang2_words).
        """
        transliterationCorpus = ET.parse(filename).getroot()
        lang1_words = []
        lang2_words = []

        for line in transliterationCorpus:
            wordlist1 = cleanEnglishVocab(line[0].text)
            wordlist2 = lang_vocab_cleaner(line[1].text)

            # Skip noisy data.  Fix: the original printed but fell
            # through; `continue` actually discards the mismatched pair.
            if len(wordlist1) != len(wordlist2):
                print('Skipping: ', line[0].text, ' - ', line[1].text)
                continue

            # Fix: the pasted snippet had empty `for` loops here, so the
            # words were never collected; append every word from both lists.
            lang1_words.extend(wordlist1)
            lang2_words.extend(wordlist2)

        return lang1_words, lang2_words

    def get_random_sample(self):
        """Return one (english, hindi) pair chosen uniformly at random."""
        return self.__getitem__(np.random.randint(len(self.eng_words)))

    def get_batch_from_array(self, batch_size, array):
        """Return the next `batch_size` items of *array* in shuffled order.

        Follows `shuffle_indices` starting at `shuffle_start_index` and
        wraps around to the front of the shuffled order near the end of
        an epoch.  Does NOT advance the cursor (get_batch does that).
        """
        end = self.shuffle_start_index + batch_size
        batch = []
        if end >= len(self.eng_words):
            # Wrap: take the overflow from the front of the shuffled order.
            batch = [array[i] for i in self.shuffle_indices[0:end % len(self.eng_words)]]
            end = len(self.eng_words)
        return batch + [array[i] for i in self.shuffle_indices[self.shuffle_start_index : end]]

    def get_batch(self, batch_size, postprocess = True):
        """Return parallel (eng_batch, hindi_batch) lists of `batch_size`
        items and advance the epoch cursor.  `postprocess` is kept for
        interface compatibility; it is unused here.
        """
        eng_batch = self.get_batch_from_array(batch_size, self.eng_words)
        hindi_batch = self.get_batch_from_array(batch_size, self.hindi_words)
        # Fix: advance by exactly batch_size; the original's
        # `batch_size + 1` silently dropped one sample between batches.
        self.shuffle_start_index += batch_size
        # Reshuffle if 1 epoch is complete.  Fix: the original only reset
        # the cursor and never shuffled, so the "shuffle" indices were
        # always the identity permutation.
        if self.shuffle_start_index >= len(self.eng_words):
            self.shuffle_start_index = 0
            np.random.shuffle(self.shuffle_indices)
        return eng_batch, hindi_batch


1 Like

This might be of help:
Writing Custom Datasets, Dataloaders and Transforms - PyTorch Docs