# -*- coding: utf-8 -*-
""" Datasets for the csl module
- CIFAR-10
- Fashion MNIST
- UCI's Adult
- ProPublica's COMPAS
- UTKFace
"""
import torch
import os, glob
import pandas as pd
import numpy as np
from PIL import Image
class CIFAR10:
    """CIFAR-10 dataset

    You can download the dataset in PyTorch tensor format from
    https://www.ocf.berkeley.edu/~chamon/data/cifar-10.zip

    .. warning:: For performance purposes, this class loads the full
        CIFAR-10 dataset to RAM. Even though it is less than 1 GB,
        you've been warned.

    Attributes
    ----------
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `torch.tensor`
        CIFAR-10 images.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `torch.tensor`
        CIFAR-10 labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `torch.tensor`) of images
        ([N] x [C = 3] x [H = 32] x [W = 32]) and label (N x 1).
    """
    classes = ('Plane', 'Car', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog',
               'Horse', 'Ship', 'Truck')
    """CIFAR-10 labels (`list` [`str`])"""
    MEAN = [0.4914, 0.4822, 0.4465]
    """Average channel value over training set (`list` [`float`])"""
    SD = [0.2023, 0.1994, 0.2010]
    """Standard deviation of channel value over training set (`list` [`float`])"""

    def __init__(self, root, train=True, subset=None, transform=None,
                 target_transform=None):
        """CIFAR-10 dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        subset : `list`, optional
            Subset of indices of the dataset to use.
            The default is `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.
        """
        self.train = train
        # map_location='cpu' keeps loading working on CPU-only machines even
        # if the tensors were serialized from a CUDA device.
        if self.train:
            self.data = torch.load(os.path.join(root, 'cifar10_trainX'),
                                   map_location='cpu')
            self.target = torch.load(os.path.join(root, 'cifar10_trainY'),
                                     map_location='cpu')
        else:
            self.data = torch.load(os.path.join(root, 'cifar10_testX'),
                                   map_location='cpu')
            self.target = torch.load(os.path.join(root, 'cifar10_testY'),
                                     map_location='cpu')
        if subset is not None:
            self.data = self.data[subset,]
            self.target = self.target[subset]
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return (images, labels) for the requested index or indices."""
        data, target = self.data[index,], self.target[index]
        # Unsqueeze if single data point so callers always get a batch dim
        if len(data.shape) == 3:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)
        if self.transform is not None:
            data = self.transform(data)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return data, target

    def __len__(self):
        """Return the number of data points in the dataset."""
        return self.target.shape[0]
class FMNIST:
    """FASHION MNIST dataset

    You can download the dataset in PyTorch tensor format from
    https://www.ocf.berkeley.edu/~chamon/data/fmnist.zip

    .. warning:: For performance purposes, this class loads the full
        FMNIST dataset to RAM. Even though it is less than 1 GB,
        you've been warned.

    Attributes
    ----------
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `torch.tensor`
        FMNIST images.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `torch.tensor`
        FMNIST labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Returns size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `torch.tensor`) of images
        ([N] x [C = 1] x [H = 28] x [W = 28]) and label (N x 1).
    """
    classes = ('T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot')
    """FMNIST labels (`list` [`str`])"""
    MEAN = 0.1307
    """Average channel value over training set (`float`)"""
    SD = 0.3081
    """Standard deviation of channel value over training set (`float`)"""

    def __init__(self, root, train=True, subset=None, transform=None,
                 target_transform=None):
        """FASHION MNIST dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        subset : `list`, optional
            Subset of indices of the dataset to use.
            The default is `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.
        """
        self.train = train
        # map_location='cpu' keeps loading working on CPU-only machines even
        # if the tensors were serialized from a CUDA device.
        if self.train:
            self.data = torch.load(os.path.join(root, 'fmnist_trainX'),
                                   map_location='cpu')
            self.target = torch.load(os.path.join(root, 'fmnist_trainY'),
                                     map_location='cpu')
        else:
            self.data = torch.load(os.path.join(root, 'fmnist_testX'),
                                   map_location='cpu')
            self.target = torch.load(os.path.join(root, 'fmnist_testY'),
                                     map_location='cpu')
        if subset is not None:
            self.data = self.data[subset,]
            self.target = self.target[subset]
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return (images, labels) for the requested index or indices."""
        data, target = self.data[index,], self.target[index]
        # Unsqueeze if single data point so callers always get a batch dim
        if len(data.shape) == 3:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)
        if self.transform is not None:
            data = self.transform(data)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return data, target

    def __len__(self):
        """Return the number of data points in the dataset."""
        return self.target.shape[0]
class Adult:
    """UCI's adult dataset

    You can download ``adult.data`` and ``adult.test`` from
    http://archive.ics.uci.edu/ml/datasets/Adult

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `pandas.DataFrame`
        Adult data points features.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `pandas.DataFrame`
        Adult data points labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Returns size of dataset.
    __getitem__()
        Return tuple of features (N x F) and label (N x 1). The number of
        features F depends on preprocessing (see ``preprocess``).
    """
    variables = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                 'marital-status', 'occupation', 'relationship', 'race',
                 'gender', 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country','income']
    """List of variables in UCI's Adult dataset (`list` [`str`])."""
    categorical = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'gender', 'native-country', 'income']
    """List of categorical variable names (`list` [`str`])."""

    def __init__(self, root, target_name='income', train=True, preprocess=None,
                 subset=None, transform=None, target_transform=None):
        """UCI's adult dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        target_name : `str`, optional
            Name of target variable. The default is `income`.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `list`, optional
            Subset of indices of the dataset to use.
            The default is `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.
        """
        self.classes = ('<= 50k', '> 50k')
        self.train = train
        # Read CSV file. Fields are separated by ", "; r",\s" must be a raw
        # string or "\s" is an invalid escape sequence in modern Python.
        if self.train:
            self.data = pd.read_csv(os.path.join(root, 'adult.data'),
                                    sep=r",\s", header=None,
                                    names=Adult.variables, engine='python')
        else:
            self.data = pd.read_csv(os.path.join(root, 'adult.test'),
                                    sep=r",\s", header=None,
                                    names=Adult.variables, skiprows=1,
                                    engine='python')
            # Test-set labels carry a trailing period (e.g. '<=50K.'): strip
            # it so train/test labels match. Assign back instead of chained
            # inplace=True, which pandas deprecates on chained access.
            self.data['income'] = self.data['income'].replace(
                to_replace=r'\.', value='', regex=True)
        # Declare categorical variables
        for var_name in Adult.categorical:
            self.data[var_name] = self.data[var_name].astype('category')
        # Preprocess data (e.g., binning, dummifying) before separating labels
        if preprocess is not None:
            self.data = preprocess(self.data)
        # Subset dataset
        if subset is not None:
            if type(subset) is int:
                self.data = self.data.iloc[[subset]]
            else:
                self.data = self.data.iloc[subset]
            # Recompute indices
            self.data.reset_index(drop=True, inplace=True)
        # Recover response variable: every column whose name starts with
        # target_name (a dummified target expands to several columns).
        self.target = self.data.filter(regex=f'^{target_name}', axis=1)
        self.data = self.data.drop(self.target.columns, axis=1)
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return (features, labels) for the requested index or indices."""
        if type(index) is int:
            data, target = self.data.iloc[[index]], self.target.iloc[[index]]
        else:
            data, target = self.data.iloc[index], self.target.iloc[index]
        # Unsqueeze if single data point
        if len(data.shape) == 1:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)
        if self.transform is not None:
            data = self.transform(data)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return data, target

    def __len__(self):
        """Return the number of data points in the dataset."""
        return self.target.shape[0]
class COMPAS:
    """ProPublica's COMPAS dataset

    You can download `compas-scores-two-years.csv` from
    https://github.com/propublica/compas-analysis

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    data : `pandas.DataFrame`
        COMPAS data points features.
    transform : `callable`
        Function applied to the data points before returning them.
    target : `pandas.DataFrame`
        COMPAS data points labels.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Returns size of dataset.
    __getitem__()
        Return tuple of features (N x F) and label (N x 1). The number of
        features F depends on preprocessing (see ``preprocess``).
    """
    variables = ['sex', 'age', 'age_cat', 'race', 'decile_score', 'score_text',
                 'v_decile_score', 'v_score_text', 'juv_misd_count', 'juv_other_count',
                 'priors_count', 'c_charge_degree', 'is_recid', 'is_violent_recid',
                 'two_year_recid']
    """List of variables retained from original ProPublica dataset (`list` [`str`])."""
    categorical = ['sex', 'age_cat', 'race', 'score_text', 'v_score_text',
                   'c_charge_degree', 'is_recid', 'is_violent_recid', 'two_year_recid']
    """List of categorical variable names (`list` [`str`])."""

    def __init__(self, root, target_name='two_year_recid', train=True, split=0.7,
                 preprocess=None, subset=None, transform=None, target_transform=None):
        """ProPublica's COMPAS dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        target_name : `str`, optional
            Name of target variable. The default is `two_year_recid`.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        split : `float`, optional
            Percentage of dataset to keep for training. The dataset is split
            randomly between training and testing, but deterministically:
            the sets returned are always the same. The default is 0.7.
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `list`, optional
            Subset of indices of the dataset to use.
            The default is `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.
        """
        self.train = train
        # Load the full ProPublica table
        df = pd.read_csv(os.path.join(root, 'compas-scores-two-years.csv'))
        # Remove the duplicated score columns
        df = df.drop('decile_score.1', axis=1).drop('priors_count.1', axis=1)
        # Keep |days_b_screening_arrest| <= 30 (as in ProPublica's analysis)
        df = df[df['days_b_screening_arrest'].between(-30, 30)]
        # Deterministic shuffled train/test split (fixed seed)
        n_total = df.shape[0]
        permutation = np.random.RandomState(seed=42).permutation(n_total)
        n_train = int(np.ceil(n_total * split))
        # Normalize indices before positional selection
        df.reset_index(drop=True, inplace=True)
        chosen = permutation[:n_train] if self.train else permutation[n_train:]
        df = df.iloc[chosen]
        df.reset_index(drop=True, inplace=True)
        # Restrict to the retained columns and mark the categorical ones
        df = df[COMPAS.variables]
        for name in COMPAS.categorical:
            df[name] = df[name].astype('category')
        if preprocess is not None:
            df = preprocess(df)
        # Optional subsetting (single index or index collection)
        if subset is not None:
            df = df.iloc[[subset]] if type(subset) is int else df.iloc[subset]
            df.reset_index(drop=True, inplace=True)
        # Separate targets: every column whose name starts with target_name
        self.target = df.filter(regex=f'^{target_name}', axis=1)
        self.data = df.drop(self.target.columns, axis=1)
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        """Return (features, labels) for the requested index or indices."""
        rows = [index] if type(index) is int else index
        data, target = self.data.iloc[rows], self.target.iloc[rows]
        # Promote a single row to a one-row batch
        if len(data.shape) == 1:
            data = data.unsqueeze(0)
            target = target.unsqueeze(0)
        if self.transform is not None:
            data = self.transform(data)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return data, target

    def __len__(self):
        """Return the number of data points in the dataset."""
        return self.target.shape[0]
class UTK:
    """UTKFace dataset

    Download the dataset from https://susanqq.github.io/UTKFace/ and indicate
    the path to the UTKFace folder.

    Attributes
    ----------
    classes : `list` [`str`]
        Class labels.
    train : `bool`
        `True` if training set or `False` otherwise.
    current_batch : `dict`
        Memoized dataset to speed-up consecutive requests for the same data.
    data : `pandas.DataFrame`
        Data frame containing the targets and path to each image.
        Contrary to ``CIFAR-10`` or ``FMNIST``, ``UTKFace`` is never fully loaded
        into memory.
    transform : `callable`
        Function applied to the data points before returning them.
    target_transform : `callable`
        Function applied to the labels before returning them.

    Methods
    -------
    __len__()
        Return size of dataset.
    __getitem__()
        Return tuple (`torch.tensor`, `pandas.DataFrame`) of image
        ([N] x [C = 3] x [H = 100] x [W = 100]) and label (N x 3).
        (Images are resized to 100 x 100 by ``_image_to_tensor``.)
    """
    MEAN = [0.5970, 0.4569, 0.3911]
    """Average channel value over training set (`list` [`float`])"""
    SD = [0.2580, 0.2307, 0.2265]
    """Standard deviation of channel value over training set (`list` [`float`])"""

    def __init__(self, root, train=True, split=0.7, preprocess=None,
                 subset=None, transform=None, target_transform=None):
        """UTKFace dataset constructor

        Parameters
        ----------
        root : `str`
            Data folder.
        train : `bool`, optional
            Returns training set if `True` and test set if `False`.
            The default is `True` (training set).
        split : `float`, optional
            Percentage of dataset to keep for training. The dataset is split
            randomly between training and testing, but deterministically:
            the sets returned are always the same. The default is 0.7.
        preprocess : `callable`, optional
            Transformations to apply before separating labels
            (e.g., binning, dummifying, etc.).
        subset : `array`, list, or tensor, optional
            Subset of indices of the dataset to use.
            The default is `None` (use the whole dataset).
        transform : `callable`, optional
            Transformation to apply to the data points. The default is `None`.
        target_transform : `callable`, optional
            Transformation to apply to the labels. The default is `None`.
        """
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.current_batch = {'batch_idx': None,
                              'data': None,
                              'target': None}
        # Index the dataset: labels are encoded in each file name as
        # '<age>_<gender>_<race>_<timestamp>.jpg'
        files = glob.glob(os.path.join(root, 'UTKFace', '*.jpg'))
        self.data = [self._parse_file(file) for file in files]
        self.data = pd.DataFrame(self.data)
        self.data.columns = ['age', 'gender', 'race']
        self.data['filename'] = files
        # Keep complete cases (drops files whose name failed to parse)
        self.data = self.data.dropna()
        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)
        # Set categorical variables
        # {0: 'White', 1: 'Black', 2: 'Asian', 3: 'Indian', 4: 'Others'}
        self.data['race'] = self.data['race'].astype('category')
        self.data['gender'] = self.data['gender'].astype('category')
        # Deterministic random train/test split (fixed seed)
        N = self.data.shape[0]
        idx_list = np.random.RandomState(seed=42).permutation(N)
        split_idx = int(np.ceil(N*split))
        train_idx = idx_list[:split_idx]
        test_idx = idx_list[split_idx:]
        if self.train:
            self.data = self.data.iloc[train_idx,]
        else:
            self.data = self.data.iloc[test_idx,]
        # Renormalize indices
        self.data.reset_index(drop=True, inplace=True)
        # Preprocess data
        if preprocess is not None:
            self.data = preprocess(self.data)
        # Subset data
        if subset is not None:
            if type(subset) is int:
                self.data = self.data.iloc[[subset]]
            else:
                self.data = self.data.iloc[subset]
            # Renormalize indices
            self.data.reset_index(drop=True, inplace=True)

    def __getitem__(self, index):
        """Return (images, labels), loading images from disk on demand."""
        # Get data subset
        if type(index) is int:
            df = self.data.iloc[[index]]
        else:
            df = self.data.iloc[index]
        if self.current_batch['batch_idx'] == set(df.index.values):
            # Reuse memoized batch (same set of row indices as last call).
            # NOTE(review): if `transform` is stochastic (data augmentation),
            # this returns the previously transformed batch — confirm intended.
            samples = self.current_batch['data']
            target = self.current_batch['target']
        else:
            # Load batch from disk
            samples = [self._image_to_tensor(filename) for filename in df['filename']]
            samples = torch.stack(samples, dim=0).squeeze()
            if len(df) == 1:
                samples = samples.unsqueeze(0)
            target = df[['age', 'gender', 'race']]
            if self.transform is not None:
                samples = self.transform(samples)
            if self.target_transform is not None:
                target = self.target_transform(target)
            # Memoize batch
            self.current_batch['batch_idx'] = set(df.index.values)
            self.current_batch['data'] = samples
            self.current_batch['target'] = target
        return samples, target

    def __len__(self):
        """Return the number of data points in the dataset."""
        return self.data.shape[0]

    @staticmethod
    def _parse_file(filename):
        """Extract (age, gender, race) from a 'age_gender_race_ts.jpg' filename.

        Returns (None, None, None) for malformed names so that `dropna` can
        discard them. Only ValueError (wrong field count or non-integer
        field) is treated as "malformed"; other errors propagate.
        """
        try:
            age, gender, race, _ = os.path.split(filename)[1].split('_')
            return int(age), int(gender), int(race)
        except ValueError:
            return None, None, None

    @staticmethod
    def _image_to_tensor(filename):
        """Transform PIL image to tensor and normalize values to [0,1].

        Images are resized to 100 x 100 before conversion.
        """
        # Load image
        pic = Image.open(filename)
        pic = pic.resize((100,100))
        # Convert to tensor
        # NOTE(review): torch.ByteStorage.from_buffer is deprecated in recent
        # torch; torch.frombuffer is the modern equivalent — verify the
        # project's minimum torch version before switching.
        img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
        img = img.view(pic.size[1], pic.size[0], len(pic.getbands()))
        # Convert to C x H x W format
        img = img.permute((2, 0, 1)).contiguous()
        # Return [0,1] image
        return img.float()/255.0