Source code for csl.datasets.datasets

# -*- coding: utf-8 -*-
""" Datasets for the csl module

- CIFAR-10
- Fashion MNIST
- UCI's Adult
- ProPublica's COMPAS
- UTKFace

"""

import torch
import os
import glob
import pandas as pd
import numpy as np
from PIL import Image


class CIFAR10:
    """CIFAR-10 dataset

    You can download the dataset in PyTorch tensor format from
    https://www.ocf.berkeley.edu/~chamon/data/cifar-10.zip

    .. warning:: For performance purposes, this class loads the full
        CIFAR-10 dataset to RAM. Even though it is less than 1 GB,
        you've been warned.

    Attributes
    ----------
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `torch.tensor`
        CIFAR-10 images.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `torch.tensor`
        CIFAR-10 labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `torch.tensor`) of images
        ([N] x [C = 3] x [H = 32] x [W = 32]) and labels (N x 1).

    """
    classes = ('Plane', 'Car', 'Bird', 'Cat', 'Deer',
               'Dog', 'Frog', 'Horse', 'Ship', 'Truck')
    """CIFAR-10 labels (`list` [`str`])"""

    MEAN = [0.4914, 0.4822, 0.4465]
    """Average channel value over training set (`list` [`float`])"""

    SD = [0.2023, 0.1994, 0.2010]
    """Standard deviation of channel value over training set (`list` [`float`])"""

    def __init__(self, root, train=True, subset=None,
                 transform=None, target_transform=None):
        """CIFAR-10 dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        subset : `list`, optional
            Subset of indices of the dataset to use. The default is
            `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.

        """
        self.train = train

        if self.train:
            self.data = torch.load(os.path.join(root, 'cifar10_trainX'))
            self.target = torch.load(os.path.join(root, 'cifar10_trainY'))
        else:
            self.data = torch.load(os.path.join(root, 'cifar10_testX'))
            self.target = torch.load(os.path.join(root, 'cifar10_testY'))

        if subset is not None:
            self.data = self.data[subset, ]
            self.target = self.target[subset]

        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        data, target = self.data[index, ], self.target[index]

        # Unsqueeze if single data point
        if len(data.shape) == 3:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)

        if self.transform is not None:
            data = self.transform(data)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return data, target

    def __len__(self):
        return self.target.shape[0]

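# A minimal usage sketch (illustrative, not part of the module; assumes the
# extracted tensor files, e.g., 'cifar10_trainX', live under './data'):
#
#     trainset = CIFAR10('./data', train=True)
#     images, labels = trainset[:128]   # batch of 128 x 3 x 32 x 32 images
#     image, label = trainset[0]        # single point, unsqueezed to 1 x 3 x 32 x 32
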

class FMNIST:
    """Fashion MNIST dataset

    You can download the dataset in PyTorch tensor format from
    https://www.ocf.berkeley.edu/~chamon/data/fmnist.zip

    .. warning:: For performance purposes, this class loads the full
        FMNIST dataset to RAM. Even though it is less than 1 GB,
        you've been warned.

    Attributes
    ----------
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `torch.tensor`
        FMNIST images.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `torch.tensor`
        FMNIST labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `torch.tensor`) of images
        ([N] x [C = 1] x [H = 28] x [W = 28]) and labels (N x 1).

    """
    classes = ('T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')
    """FMNIST labels (`list` [`str`])"""

    MEAN = 0.1307
    """Average channel value over training set (`float`)"""

    SD = 0.3081
    """Standard deviation of channel value over training set (`float`)"""

    def __init__(self, root, train=True, subset=None,
                 transform=None, target_transform=None):
        """Fashion MNIST dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        subset : `list`, optional
            Subset of indices of the dataset to use. The default is
            `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.

        """
        self.train = train

        if self.train:
            self.data = torch.load(os.path.join(root, 'fmnist_trainX'))
            self.target = torch.load(os.path.join(root, 'fmnist_trainY'))
        else:
            self.data = torch.load(os.path.join(root, 'fmnist_testX'))
            self.target = torch.load(os.path.join(root, 'fmnist_testY'))

        if subset is not None:
            self.data = self.data[subset, ]
            self.target = self.target[subset]

        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        data, target = self.data[index, ], self.target[index]

        # Unsqueeze if single data point
        if len(data.shape) == 3:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)

        if self.transform is not None:
            data = self.transform(data)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return data, target

    def __len__(self):
        return self.target.shape[0]

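# A minimal usage sketch (illustrative, not part of the module; assumes the
# extracted tensor files, e.g., 'fmnist_trainX', live under './data'). The
# transform shown normalizes images with the class constants:
#
#     normalize = lambda x: (x - FMNIST.MEAN) / FMNIST.SD
#     trainset = FMNIST('./data', train=True, transform=normalize)
#     images, labels = trainset[:64]    # batch of 64 x 1 x 28 x 28 images
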

class Adult:
    """UCI's Adult dataset

    You can download ``adult.data`` and ``adult.test`` from
    http://archive.ics.uci.edu/ml/datasets/Adult

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `pandas.DataFrame`
        Adult data points features.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `pandas.DataFrame`
        Adult data points labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`pandas.DataFrame`, `pandas.DataFrame`) of features
        (N x F) and labels (N x 1), unless modified by ``transform`` or
        ``target_transform``. The number of features F depends on
        preprocessing (see ``preprocess``).

    """
    variables = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                 'marital-status', 'occupation', 'relationship', 'race',
                 'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country', 'income']
    """List of variables in UCI's Adult dataset (`list` [`str`])."""

    categorical = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'gender', 'native-country',
                   'income']
    """List of categorical variable names (`list` [`str`])."""

    def __init__(self, root, target_name='income', train=True, preprocess=None,
                 subset=None, transform=None, target_transform=None):
        """UCI's Adult dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        target_name : `str`, optional
            Name of target variable. The default is `income`.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `list`, optional
            Subset of indices of the dataset to use. The default is
            `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.

        """
        self.classes = ('<= 50k', '> 50k')
        self.train = train

        # Read CSV file
        if self.train:
            self.data = pd.read_csv(os.path.join(root, 'adult.data'),
                                    sep=r',\s', header=None,
                                    names=Adult.variables, engine='python')
        else:
            self.data = pd.read_csv(os.path.join(root, 'adult.test'),
                                    sep=r',\s', header=None,
                                    names=Adult.variables, skiprows=1,
                                    engine='python')
            # Test labels carry a trailing period ('<=50K.'): strip it
            self.data['income'] = self.data['income'].replace(
                to_replace=r'\.', value='', regex=True)

        # Declare categorical variables
        for var_name in Adult.categorical:
            self.data[var_name] = self.data[var_name].astype('category')

        # Preprocess data
        if preprocess is not None:
            self.data = preprocess(self.data)

        # Subset dataset
        if subset is not None:
            if type(subset) is int:
                self.data = self.data.iloc[[subset]]
            else:
                self.data = self.data.iloc[subset]

        # Recompute indices
        self.data.reset_index(drop=True, inplace=True)

        # Recover response variable
        self.target = self.data.filter(regex=f'^{target_name}', axis=1)
        self.data = self.data.drop(self.target.columns, axis=1)

        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        if type(index) is int:
            data, target = self.data.iloc[[index]], self.target.iloc[[index]]
        else:
            data, target = self.data.iloc[index], self.target.iloc[index]

        # Unsqueeze if single data point
        if len(data.shape) == 1:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)

        if self.transform is not None:
            data = self.transform(data)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return data, target

    def __len__(self):
        return self.target.shape[0]

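# A minimal usage sketch (illustrative, not part of the module; assumes
# 'adult.data' and 'adult.test' live under './data'). The preprocess shown
# one-hot encodes every categorical column, so the `^income` target filter
# picks up the resulting 'income_*' dummy columns:
#
#     trainset = Adult('./data', train=True, preprocess=pd.get_dummies)
#     features, income = trainset[:32]   # pandas DataFrames (N x F and N x 2)
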

class COMPAS:
    """ProPublica's COMPAS dataset

    You can download ``compas-scores-two-years.csv`` from
    https://github.com/propublica/compas-analysis

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `pandas.DataFrame`
        COMPAS data points features.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `pandas.DataFrame`
        COMPAS data points labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`pandas.DataFrame`, `pandas.DataFrame`) of features
        (N x F) and labels (N x 1), unless modified by ``transform`` or
        ``target_transform``. The number of features F depends on
        preprocessing (see ``preprocess``).

    """
    variables = ['sex', 'age', 'age_cat', 'race', 'decile_score',
                 'score_text', 'v_decile_score', 'v_score_text',
                 'juv_misd_count', 'juv_other_count', 'priors_count',
                 'c_charge_degree', 'is_recid', 'is_violent_recid',
                 'two_year_recid']
    """List of variables retained from original ProPublica dataset (`list` [`str`])."""

    categorical = ['sex', 'age_cat', 'race', 'score_text', 'v_score_text',
                   'c_charge_degree', 'is_recid', 'is_violent_recid',
                   'two_year_recid']
    """List of categorical variable names (`list` [`str`])."""

    def __init__(self, root, target_name='two_year_recid', train=True,
                 split=0.7, preprocess=None, subset=None, transform=None,
                 target_transform=None):
        """ProPublica's COMPAS dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        target_name : `str`, optional
            Name of target variable. The default is `two_year_recid`.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        split : `float`, optional
            Percentage of dataset to keep for training. The dataset is
            split randomly between training and testing, but the split is
            deterministic, i.e., the sets returned are always the same.
            The default is 0.7.
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `list`, optional
            Subset of indices of the dataset to use. The default is
            `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.

        """
        self.train = train

        # Read CSV file
        self.data = pd.read_csv(os.path.join(root,
                                             'compas-scores-two-years.csv'))

        # Drop repeated columns
        self.data = self.data.drop('decile_score.1', axis=1)
        self.data = self.data.drop('priors_count.1', axis=1)

        # Filter |days_b_screening_arrest| <= 30 (as in ProPublica analysis)
        self.data = self.data[(self.data['days_b_screening_arrest'] >= -30) &
                              (self.data['days_b_screening_arrest'] <= 30)]

        # Random split
        N = self.data.shape[0]
        idx_list = np.random.RandomState(seed=42).permutation(N)
        split_idx = int(np.ceil(N*split))
        train_idx = idx_list[:split_idx]
        test_idx = idx_list[split_idx:]

        # Normalize indices
        self.data.reset_index(drop=True, inplace=True)

        if self.train:
            self.data = self.data.iloc[train_idx, ]
        else:
            self.data = self.data.iloc[test_idx, ]

        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)

        # Keep only columns of interest
        self.data = self.data[COMPAS.variables]

        # Declare categorical variables
        for var_name in COMPAS.categorical:
            self.data[var_name] = self.data[var_name].astype('category')

        if preprocess is not None:
            self.data = preprocess(self.data)

        # Subset data
        if subset is not None:
            if type(subset) is int:
                self.data = self.data.iloc[[subset]]
            else:
                self.data = self.data.iloc[subset]

        # Recompute indices
        self.data.reset_index(drop=True, inplace=True)

        # Recover response variable
        self.target = self.data.filter(regex=f'^{target_name}', axis=1)
        self.data = self.data.drop(self.target.columns, axis=1)

        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        if type(index) is int:
            data, target = self.data.iloc[[index]], self.target.iloc[[index]]
        else:
            data, target = self.data.iloc[index], self.target.iloc[index]

        # Unsqueeze if single data point
        if len(data.shape) == 1:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)

        if self.transform is not None:
            data = self.transform(data)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return data, target

    def __len__(self):
        return self.target.shape[0]

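# A minimal usage sketch (illustrative, not part of the module; assumes
# 'compas-scores-two-years.csv' lives under './data'). The fixed seed makes
# the train/test split reproducible across runs:
#
#     trainset = COMPAS('./data', train=True)    # 70% split
#     testset = COMPAS('./data', train=False)    # disjoint 30%, same seed
#     features, recid = trainset[:16]            # pandas DataFrames
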

class UTK:
    """UTKFace dataset

    Download the dataset from https://susanqq.github.io/UTKFace/ and
    indicate the path to the UTKFace folder.

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    current_batch : `dict`
        Memoized dataset to speed-up consecutive requests for the same data.
    data : `pandas.DataFrame`
        Data frame containing the targets and path to each image. Contrary
        to ``CIFAR-10`` or ``FMNIST``, ``UTKFace`` is never fully loaded
        into memory.
    transform : `callable`
        Function applied to the data points before returning them.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `pandas.DataFrame`) of images
        ([N] x [C = 3] x [H = 100] x [W = 100], resized from the original
        200 x 200 on load) and labels (N x 3).

    """
    MEAN = [0.5970, 0.4569, 0.3911]
    """Average channel value over training set (`list` [`float`])"""

    SD = [0.2580, 0.2307, 0.2265]
    """Standard deviation of channel value over training set (`list` [`float`])"""

    def __init__(self, root, train=True, split=0.7, preprocess=None,
                 subset=None, transform=None, target_transform=None):
        """UTKFace dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        split : `float`, optional
            Percentage of dataset to keep for training. The dataset is
            split randomly between training and testing, but the split is
            deterministic, i.e., the sets returned are always the same.
            The default is 0.7.
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `array`, `list`, or `tensor`, optional
            Subset of indices of the dataset to use. The default is
            `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.

        """
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.current_batch = {'batch_idx': None, 'data': None, 'target': None}

        # Load dataset
        files = glob.glob(os.path.join(root, 'UTKFace', '*.jpg'))
        self.data = [self._parse_file(file) for file in files]
        self.data = pd.DataFrame(self.data)
        self.data.columns = ['age', 'gender', 'race']
        self.data['filename'] = files

        # Keep complete cases
        self.data = self.data.dropna()

        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)

        # Set categorical variables
        # {0: 'White', 1: 'Black', 2: 'Asian', 3: 'Indian', 4: 'Others'}
        self.data['race'] = self.data['race'].astype('category')
        self.data['gender'] = self.data['gender'].astype('category')

        # Random split
        N = self.data.shape[0]
        idx_list = np.random.RandomState(seed=42).permutation(N)
        split_idx = int(np.ceil(N*split))
        train_idx = idx_list[:split_idx]
        test_idx = idx_list[split_idx:]

        if self.train:
            self.data = self.data.iloc[train_idx, ]
        else:
            self.data = self.data.iloc[test_idx, ]

        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)

        # Preprocess data
        if preprocess is not None:
            self.data = preprocess(self.data)

        # Subset data
        if subset is not None:
            if type(subset) is int:
                self.data = self.data.iloc[[subset]]
            else:
                self.data = self.data.iloc[subset]

        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)

    def __getitem__(self, index):
        # Get data subset
        if type(index) is int:
            df = self.data.iloc[[index]]
        else:
            df = self.data.iloc[index]

        if self.current_batch['batch_idx'] == set(df.index.values):
            # Load memoized batch
            samples = self.current_batch['data']
            target = self.current_batch['target']
        else:
            # Load batch from disk
            samples = [self._image_to_tensor(filename)
                       for filename in df['filename']]
            samples = torch.stack(samples, dim=0).squeeze()
            if len(df) == 1:
                samples = samples.unsqueeze(0)

            target = df[['age', 'gender', 'race']]

            if self.transform is not None:
                samples = self.transform(samples)

            if self.target_transform is not None:
                target = self.target_transform(target)

            # Memoize batch
            self.current_batch['batch_idx'] = set(df.index.values)
            self.current_batch['data'] = samples
            self.current_batch['target'] = target

        return samples, target

    def __len__(self):
        return self.data.shape[0]

    @staticmethod
    def _parse_file(filename):
        """Extract information about data point from filename."""
        try:
            age, gender, race, _ = os.path.split(filename)[1].split('_')
            return int(age), int(gender), int(race)
        except Exception:
            return None, None, None

    @staticmethod
    def _image_to_tensor(filename):
        """Transform PIL image to tensor and normalize values to [0,1]."""
        # Load image
        pic = Image.open(filename)
        pic = pic.resize((100, 100))

        # Convert to tensor
        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        img = img.view(pic.size[1], pic.size[0], len(pic.getbands()))

        # Convert to C x H x W format
        img = img.permute((2, 0, 1)).contiguous()

        # Return [0,1] image
        return img.float()/255.0

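# A minimal usage sketch (illustrative, not part of the module; assumes the
# unpacked 'UTKFace' image folder lives under './data'). Unlike CIFAR10 or
# FMNIST, images are read from disk per request and memoized per batch:
#
#     trainset = UTK('./data', train=True)
#     images, labels = trainset[:8]   # 8 x 3 x 100 x 100 tensor, 8 x 3 DataFrame
#     images2, _ = trainset[:8]       # same indices: served from memoized batch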