# Author: Robert McGibbon <rmcgibbo@gmail.com>
# Contributors:
# Copyright (c) 2014, Stanford University
# All rights reserved.
from __future__ import absolute_import, print_function, division
import sys
import os
import re
import glob
from os.path import join, exists, expanduser
import socket
import getpass
import itertools
from datetime import datetime
try:
    from collections.abc import Sequence
except ImportError:  # Python 2
    from collections import Sequence
import warnings
import tables
import mdtraj as md
from mdtraj.core.trajectory import _parse_topology
import numpy as np
from . import version
_PYTABLES_DISABLE_COMPRESSION = tables.Filters(complevel=0)
__all__ = ['dataset']
def dataset(path, mode='r', fmt=None, verbose=False, **kwargs):
"""Open a dataset object
MSMBuilder supports several dataset 'formats' for storing
lists of sequences on disk.
This function can also be used as a context manager.
Parameters
----------
path : str
The path to the dataset on the filesystem
mode : {'r', 'w', 'a'}
Open a dataset for reading, writing, or appending. Note that
some formats only support a subset of these modes.
    fmt : {'dir-npy', 'hdf5', 'mdtraj', 'dir-npy-union', 'hdf5-union'}
        The format of the data on disk
``dir-npy``
A directory of binary numpy files, one file per sequence
``hdf5``
A single hdf5 file with each sequence as an array node
``mdtraj``
A read-only set of trajectory files that can be loaded
with mdtraj
        ``dir-npy-union`` or ``hdf5-union``
            Several datasets of the corresponding type whose
            features are concatenated along the feature axis.
verbose : bool
Whether to print information about the dataset
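
    Examples
    --------
    A minimal sketch of round-tripping a sequence through a
    ``dir-npy`` dataset (the path below is hypothetical)::

        >>> with dataset('mydata/', mode='w', fmt='dir-npy') as ds:
        ...     ds[0] = np.random.randn(10, 3)
        >>> with dataset('mydata/', mode='r') as ds:
        ...     print(ds[0].shape)
        (10, 3)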
"""
    if mode == 'r' and fmt is None:
        fmt = _guess_format(path)
    elif mode in ('w', 'a') and fmt is None:
        raise ValueError('No fmt specified for mode="%s"; fmt is '
                         'required when writing or appending' % mode)
if fmt == 'dir-npy':
return NumpyDirDataset(path, mode=mode, verbose=verbose)
elif fmt == 'mdtraj':
return MDTrajDataset(path, mode=mode, verbose=verbose, **kwargs)
elif fmt == 'hdf5':
return HDF5Dataset(path, mode=mode, verbose=verbose)
elif fmt.endswith("-union"):
sub_fmt = fmt[:-len('-union')]
return UnionDataset(path, fmt=sub_fmt, mode=mode, verbose=verbose)
else:
raise NotImplementedError("unknown fmt: %s" % fmt)
def _guess_format(path):
"""Guess the format of a dataset based on its filename / filenames.
"""
if os.path.isdir(path):
return 'dir-npy'
if path.endswith('.h5') or path.endswith('.hdf5'):
# TODO: Check for mdtraj .h5 file
return 'hdf5'
# TODO: What about a list of trajectories, e.g. from command line nargs='+'
return 'mdtraj'
class _BaseDataset(Sequence):
_PROVENANCE_TEMPLATE = '''MSMBuilder Dataset:
MSMBuilder:\t{version}
Command:\t{cmdline}
Path:\t\t{path}
Username:\t{user}
Hostname:\t{hostname}
Date:\t\t{date}
Comments:\t\t{comments}
'''
_PREV_TEMPLATE = '''
== Derived from ==
{previous}
'''
def __init__(self, path, mode='r', verbose=False):
self.path = path
self.mode = mode
self.verbose = verbose
if mode not in ('r', 'w', 'a'):
raise ValueError('mode must be one of "r", "w", "a"')
        if mode in ('w', 'a'):
            if mode == 'w' and exists(path):
                raise ValueError('File exists: %s' % path)
            # os.makedirs(path, exist_ok=True) would be simpler, but is py3-only
            try:
                os.makedirs(path)
            except OSError:
                # directory already exists (expected for mode 'a')
                pass
            self._write_provenance()
def create_derived(self, out_path, comments='', fmt=None):
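        """Open a new (empty) dataset at ``out_path`` whose provenance
        records that it was derived from this dataset."""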
if fmt is None:
out_dataset = self.__class__(out_path, mode='w', verbose=self.verbose)
else:
out_dataset = dataset(out_path, mode='w', verbose=self.verbose, fmt=fmt)
out_dataset._write_provenance(previous=self.provenance, comments=comments)
return out_dataset
def apply(self, fn):
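        """Lazily apply ``fn`` to each sequence, yielding the results."""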
for key in self.keys():
yield fn(self.get(key))
def _build_provenance(self, previous=None, comments=''):
val = self._PROVENANCE_TEMPLATE.format(
version=version.full_version,
cmdline=' '.join(sys.argv),
user=getpass.getuser(),
hostname=socket.gethostname(),
path=self.path,
comments=comments,
date=datetime.now().strftime("%B %d, %Y %I:%M %p"))
if previous:
val += self._PREV_TEMPLATE.format(previous=previous)
return val
@property
def provenance(self):
raise NotImplementedError('implemented in subclass')
def _write_provenance(self, previous=None, comments=''):
raise NotImplementedError('implemented in subclass')
def __len__(self):
return sum(1 for xx in self.keys())
def __getitem__(self, i):
return self.get(i)
def __setitem__(self, i, x):
return self.set(i, x)
def __iter__(self):
for key in self.keys():
yield self.get(key)
def keys(self):
# keys()[i], get(i) and set(i, x) should all follow
# the same ordering convention for the indices / items.
        raise NotImplementedError('implemented in subclass')
def items(self):
for key in self.keys():
yield (key, self.get(key))
    def get(self, i):
        raise NotImplementedError('implemented in subclass')
    def set(self, i, x):
        raise NotImplementedError('implemented in subclass')
def close(self):
pass
def flush(self):
pass
def __enter__(self):
return self
def __exit__(self, *exc_info):
self.close()
class NumpyDirDataset(_BaseDataset):
"""Mixtape dataset container
Parameters
----------
path : str
mode : {'r', 'w', 'a'}
Read, write, or append. If mode is set to 'a' or 'w',
duplicate keys will be overwritten.
    Examples
    --------
    for X in NumpyDirDataset('path/to/dataset'):
        print(X)
    """
_ITEM_FORMAT = '%08d.npy'
    _ITEM_RE = re.compile(r'(\d{8})\.npy')
_PROVENANCE_FILE = 'PROVENANCE.txt'
def get(self, i, mmap=False):
if isinstance(i, slice):
items = []
start, stop, step = i.indices(len(self))
for ii in itertools.islice(itertools.count(), start, stop, step):
items.append(self.get(ii))
return items
mmap_mode = 'r' if mmap else None
filename = join(self.path, self._ITEM_FORMAT % i)
if self.verbose:
            print('[NumpyDirDataset] loading %s' % filename)
try:
            return np.load(filename, mmap_mode=mmap_mode)
except IOError as e:
raise IndexError(e)
def set(self, i, x):
        if self.mode not in ('w', 'a'):
            raise IOError('Dataset not opened for writing')
        filename = join(self.path, self._ITEM_FORMAT % i)
        if self.verbose:
            print('[NumpyDirDataset] saving %s' % filename)
return np.save(filename, x)
def keys(self):
for fn in sorted(os.listdir(os.path.expanduser(self.path)), key=_keynat):
match = self._ITEM_RE.match(fn)
if match:
yield int(match.group(1))
@property
def provenance(self):
try:
with open(join(self.path, self._PROVENANCE_FILE), 'r') as f:
return f.read()
except IOError:
return 'No available provenance'
def _write_provenance(self, previous=None, comments=''):
with open(join(self.path, self._PROVENANCE_FILE), 'w') as f:
p = self._build_provenance(previous=previous, comments=comments)
f.write(p)
class HDF5Dataset(_BaseDataset):
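    """MSMBuilder dataset container backed by a single HDF5 (PyTables)
    file, with each sequence stored as an ``arr_<i>`` array node under
    the root group. Only 'r' and 'w' modes are supported.

    Examples
    --------
    A minimal sketch (the filename is hypothetical)::

        with HDF5Dataset('features.h5', mode='w') as ds:
            ds[0] = np.zeros((10, 2))
    """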
_ITEM_FORMAT = 'arr_%d'
    _ITEM_RE = re.compile(r'arr_(\d+)')
def __init__(self, path, mode='r', verbose=False):
if mode not in ('r', 'w'):
raise ValueError('mode must be one of "r", "w"')
if mode == 'w':
if exists(path):
raise ValueError('File exists: %s' % path)
self._handle = tables.open_file(path, mode=mode,
filters=_PYTABLES_DISABLE_COMPRESSION)
self.path = path
self.mode = mode
self.verbose = verbose
if mode == 'w':
self._write_provenance()
    def __getstate__(self):
        # The pytables file handle cannot be pickled, so save only the
        # constructor arguments; __setstate__ reopens the file.
        return {'path': self.path, 'mode': self.mode, 'verbose': self.verbose}
def __setstate__(self, state):
self.path = state['path']
self.mode = state['mode']
self.verbose = state['verbose']
self._handle = tables.open_file(self.path, mode=self.mode,
filters=_PYTABLES_DISABLE_COMPRESSION)
def get(self, i, mmap=False):
if isinstance(i, slice):
items = []
start, stop, step = i.indices(len(self))
for ii in itertools.islice(itertools.count(), start, stop, step):
items.append(self.get(ii))
return items
return self._handle.get_node('/', self._ITEM_FORMAT % i)[:]
def keys(self):
nodes = self._handle.list_nodes('/')
for node in sorted(nodes, key=lambda x: _keynat(x.name)):
match = self._ITEM_RE.match(node.name)
if match:
yield int(match.group(1))
def set(self, i, x):
if 'w' not in self.mode:
raise IOError('Dataset not opened for writing')
try:
self._handle.create_carray('/', self._ITEM_FORMAT % i, obj=x)
except tables.exceptions.NodeError:
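            # A node with this name already exists; remove it and retry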
self._handle.remove_node('/', self._ITEM_FORMAT % i)
self.set(i, x)
@property
def provenance(self):
try:
return self._handle.root._v_attrs['provenance']
except KeyError:
return 'No available provenance'
def _write_provenance(self, previous=None, comments=''):
p = self._build_provenance(previous=previous, comments=comments)
self._handle.root._v_attrs['provenance'] = p
def close(self):
if hasattr(self, '_handle'):
self._handle.close()
def flush(self):
self._handle.flush()
def __del__(self):
self.close()
class MDTrajDataset(_BaseDataset):
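    """Read-only dataset view over a set of trajectory files that can
    be loaded with mdtraj. ``path`` may be a glob pattern or a list of
    filenames.

    Examples
    --------
    A minimal sketch (file names are hypothetical)::

        ds = MDTrajDataset('trajs/*.xtc', topology='top.pdb')
        for traj in ds:
            print(traj.n_frames)
    """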
_PROVENANCE_TEMPLATE = '''MDTraj dataset:
path:\t\t{path}
topology:\t{topology}
stride:\t\t{stride}
atom_indices:\t{atom_indices}
'''
def __init__(self, path, mode='r', topology=None, stride=1,
atom_indices=None, verbose=False):
if mode != 'r':
raise ValueError('mode must be "r"')
self.path = path
self.topology = topology
self.stride = stride
self.atom_indices = atom_indices
self.verbose = verbose
if isinstance(path, list):
self.glob_matches = [expanduser(fn) for fn in path]
else:
self.glob_matches = sorted(glob.glob(expanduser(path)), key=_keynat)
if topology is None:
self._topology = None
else:
self._topology = _parse_topology(os.path.expanduser(topology))
def get(self, i):
if self.verbose:
print('[MDTraj dataset] loading %s' % self.filename(i))
if self._topology is None:
t = md.load(self.filename(i), stride=self.stride,
atom_indices=self.atom_indices)
else:
t = md.load(self.filename(i), stride=self.stride,
atom_indices=self.atom_indices, top=self._topology)
return t
def filename(self, i):
return self.glob_matches[i]
def iterload(self, i, chunk):
if self.verbose:
print('[MDTraj dataset] iterloading %s' % self.filename(i))
if self._topology is None:
return md.iterload(
self.filename(i), chunk=chunk, stride=self.stride,
atom_indices=self.atom_indices)
else:
return md.iterload(
self.filename(i), chunk=chunk, stride=self.stride,
atom_indices=self.atom_indices, top=self._topology)
def keys(self):
return iter(range(len(self.glob_matches)))
@property
def provenance(self):
return self._PROVENANCE_TEMPLATE.format(
path=self.path, topology=self.topology,
atom_indices=self.atom_indices, stride=self.stride)
def _dim_match(arr):
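    """Promote a 1-d array to a 2-d column vector so that sequences
    can be concatenated along the feature axis."""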
if arr.ndim == 1:
return arr[:, np.newaxis]
return arr
class UnionDataset(_BaseDataset):
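    """Read-only view over several datasets of equal length whose
    features are concatenated along the feature axis.

    Deprecated: prefer ``msmbuilder.featurizer.FeatureUnion``.
    """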
def __init__(self, paths, mode, fmt='dir-npy', verbose=False):
warnings.warn("UnionDataset is deprecated. "
"Please use msmbuilder.featurizer.FeatureUnion")
# Check mode
if mode != 'r':
raise ValueError("Union datasets are read only")
# Check format
supported_subformats = ['dir-npy', 'hdf5']
if fmt not in supported_subformats:
err = "Format must be one of {}. You gave {}"
err = err.format(supported_subformats, fmt)
raise ValueError(err)
# Save parameters
self.verbose = verbose
self.datasets = [dataset(path, mode, fmt, verbose)
for path in paths]
# Sanity check
self._check_same_length()
def _check_same_length(self):
"""Check that the datasets are the same length"""
lens = []
for ds in self.datasets:
lens.append(
sum(1 for _ in ds.keys())
)
if len(set(lens)) > 1:
err = "Each dataset must be the same length. You gave: {}"
err = err.format(lens)
raise ValueError(err)
def keys(self):
return self.datasets[0].keys()
def get(self, i):
return np.concatenate([_dim_match(ds.get(i))
for ds in self.datasets], axis=1)
def close(self):
for ds in self.datasets:
ds.close()
    def flush(self):
        for ds in self.datasets:
            ds.flush()
@property
def provenance(self):
return "\n\n".join(ds.provenance for ds in self.datasets)
def _keynat(string):
"""A natural sort helper function for sort() and sorted()
without using regular expression.
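
    Examples
    --------
    >>> sorted(['arr_10', 'arr_2'], key=_keynat)
    ['arr_2', 'arr_10']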
"""
r = []
for c in string:
if c.isdigit():
if r and isinstance(r[-1], int):
r[-1] = r[-1] * 10 + int(c)
else:
r.append(int(c))
else:
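            # Non-digit characters are mapped to 9 + ord(c) so they never
            # collide with small accumulated digit values.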
r.append(9 + ord(c))
return r