from __future__ import print_function, division, absolute_import
from mdtraj.utils.six import PY2
from mdtraj.utils.six.moves import xrange
import sys
import types
import random
import numpy as np
try:
import fastcluster
except ImportError:
pass
import scipy.cluster.hierarchy
import mdtraj as md
from msmbuilder import metrics
from mdtraj import io
from msmbuilder.utils import uneven_zip
from multiprocessing import Pool
try:
from deap import dtm # has parallel map() implementation via mpi
except:
pass
import logging
logger = logging.getLogger(__name__)
#####################################################################
# #
# Begin Helper Functions #
# #
#####################################################################
[docs]def concatenate_trajectories(trajectories):
"""Concatenate a list of trajectories into a single long trajectory
Parameters
----------
trajectories : list
list of mdtraj.Trajectory object
Returns
-------
concat_traj : mdtraj.Trajectory
"""
assert len(trajectories) > 0, 'Please supply a list of trajectories'
concat_traj = trajectories[0]
for i in xrange(1, len(trajectories)):
# Use mdtraj operator overloading
concat_traj += trajectories[i]
return concat_traj
[docs]def concatenate_prep_trajectories(prep_trajectories, metric):
"""Concatenate a list of prepared trajectories and
create a single prepared_trajectory.
This is non-trivial because the RMSD and LPRMSD prepared
trajectories are not np.ndarrays ...
Parameters
----------
prep_trajectories : list
list of prepared trajectories
metric : msmbuilder.metrics.AbstractDistance metric subclass instance
metric used to prepare the trajectories. Needed for RMSD and LPRMSD
since concatenation requires recreating the prepared trajectory
Returns
-------
ptraj : prepared_trajectory
prepared trajectory instance, like that returned from
metric.prepare_trajectory
"""
if isinstance(prep_trajectories[0], np.ndarray):
ptraj = np.concatenate(prep_trajectories)
elif isinstance(prep_trajectories[0], RMSD.TheoData):
xyz = np.concatenate([p.XYZData[:, :, :p.NumAtoms] for p in prep_trajectories])
xyz = xyz.transpose((0, 2, 1))
ptraj = metric.TheoData(xyz)
else:
raise Exception("unrecognized prepared trajectory."
"NOTE: LPRMSD currently unsupported. Email schwancr@stanford.edu")
return ptraj
[docs]def unconcatenate_trajectory(trajectory, lengths):
"""Take a single trajectory that was created by concatenating seperate
trajectories and unconcenatenate it, returning the original trajectories.
You have to supply the lengths of the original trajectories.
Parameters
----------
trajectory : mdtraj.Trajectory
Long trajectory to be split
lengths : array_like
list of lengths to split the long trajectory into
Returns
-------
A list of trajectories
"""
return split(trajectory, lengths)
def split(longlist, lengths):
"""Split a long list into segments
Parameters
----------
longlist : array_like
Long trajectory to be split
lengths : array_like
list of lengths to split the long list into
Returns
-------
A list of lists
"""
if not sum(lengths) == len(longlist):
raise Exception('sum(lengths)=%s, len(longlist)=%s' % (sum(lengths), len(longlist)))
def func(x):
length, cumlength = x
return longlist[cumlength - length: cumlength]
output = [func(elem) for elem in zip(lengths, np.cumsum(lengths))]
return output
[docs]def stochastic_subsample(trajectories, shrink_multiple):
"""Randomly subsample from a trajectory
Given a list of trajectories, return a single trajectory
shrink_multiple times smaller than the total number of frames in
trajectories taken by random sampling of frames from trajectories
Parameters
----------
trajectories : list of mdtraj.Trajectory
list of trajectories to sample from
shrink_multiple : int
fraction to shrint by
Note that this method will modify the trajectory objects that you pass in
@CHECK is the note above actually true?
"""
shrink_multiple = int(shrink_multiple)
if shrink_multiple < 1:
raise ValueError('Shrink multiple should be an integer greater '
'than 1. You supplied %s' % shrink_multiple)
elif shrink_multiple == 1:
# if isinstance(trajectories, Trajectory):
# return trajectories
# return concatenate_trajectories(trajectories)
return trajectories
if isinstance(trajectories, md.Trajectory):
traj = trajectories
length = traj.n_frames
new_length = int(length / shrink_multiple)
if new_length <= 0:
return None
indices = np.array(random.sample(np.arange(length), new_length))
new_traj = traj[indices, :, :]
return new_traj
else:
# assume we have a list of trajectories
# check that all trajectories have the same number of atoms
num_atoms = np.array([traj.n_atoms for traj in trajectories])
if not np.all(num_atoms == num_atoms[0]):
raise Exception('Not all same # atoms')
# shrink each trajectory
subsampled = [stochastic_subsample(traj, shrink_multiple) for traj in trajectories]
# filter out failures
subsampled = [a for a in subsampled if a is not None]
return concatenate_trajectories(subsampled)
[docs]def deterministic_subsample(trajectories, stride, start=0):
"""Given a list of trajectories, return a single trajectory
shrink_multiple times smaller than the total number of frames in
trajectories by taking every "stride"th frame, starting from "start"
Note that this method will modify the trajectory objects that you pass in
Parameters
----------
trajectories : list of mdtraj.Trajectory
trajectories to subsample from
stride : int
freq to subsample at
start : int
first frame to pick
Returns
-------
trajectory : mdtraj.trajectory
shortened trajectory
"""
stride = int(stride)
if stride < 1:
raise ValueError('stride should be an integer greater than 1. You supplied %s' % stride)
elif stride == 1:
# if isinstance(trajectories, Trajectory):
# return trajectories
# return concatenate_trajectories(trajectories)
return trajectories
if isinstance(trajectories, Trajectory):
traj = trajectories
traj = traj[start::stride]
return traj
else:
# assume we have a list of trajectories
# check that all trajectories have the same number of atoms
num_atoms = np.array([traj.n_atoms for traj in trajectories])
if not np.all(num_atoms == num_atoms[0]):
raise Exception('Not all same # atoms')
# shrink each trajectory
strided = [deterministic_subsample(traj, stride, start) for traj in trajectories]
return concatenate_trajectories(strided)
[docs]def p_norm(data, p=2):
"""p_norm of an ndarray with XYZ coordinates
Parameters
----------
data : ndarray
XYZ coordinates. TODO: Shape?
p : {int, "max"}, optional
power of p_norm
Returns
-------
value : float
the answer
"""
if p == "max":
return data.max()
else:
p = float(p)
n = float(data.shape[0])
return ((data ** p).sum() / n) ** (1.0 / p)
#####################################################################
# #
# End Helper Functions #
# Begin Clustering Function #
# #
#####################################################################
def _assign(metric, ptraj, generator_indices):
"""Assign the frames in ptraj to the centers with indices *generator_indices*
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
ptraj : prepared trajectory
ptraj return by the action of the preceding metric on a msmbuilder trajectory
generator_indices : array_like
indices (with respect to ptraj) of the frames to be considered the
cluster centers.
Returns
-------
assignments : ndarray
`assignments[i] = j` means that the `i`th frame in ptraj is assigned to
`ptraj[j]`
distances : ndarray
`distances[i] = j` means that the distance (according to `metric`) from
`ptraj[i]` to `ptraj[assignments[i]]` is `j`
"""
assignments = np.zeros(len(ptraj), dtype='int')
distances = np.inf * np.ones(len(ptraj), dtype='float32')
for m in generator_indices:
d = metric.one_to_all(ptraj, ptraj, m)
closer = np.where(d < distances)[0]
distances[closer] = d[closer]
assignments[closer] = m
return assignments, distances
def _kcenters(metric, ptraj, k=None, distance_cutoff=None, seed=0, verbose=True):
"""Run kcenters clustering algorithm.
Terminates either when `k` clusters have been identified, or when every data
is clustered better than `distance_cutoff`.
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
ptraj : prepared trajectory
ptraj return by the action of the preceding metric on a msmbuilder trajectory
k : {int, None}
number of desired clusters, or None
distance_cutoff : {float, None}
Stop identifying new clusters once the distance of every data to its
cluster center falls below this value. Supply either this or `k`
seed : int, optional
index of the frame to use as the first cluster center
verbose : bool, optional
print as each new generator is found
Returns
-------
generator_indices : ndarray
indices (with respect to ptraj) of the frames to be considered cluster centers
assignments : ndarray
the cluster center to which each frame is assigned to (1D)
distances : ndarray
distance from each of the frames to the cluster center it was assigned to
See Also
--------
KCenters : wrapper around this implementation that provides more convenience
Notes
------
the assignments are numbered with respect to the position in ptraj of the
generator, not the position in generator_indices. That is, assignments[10] =
1020 means that the 10th simulation frame is assigned to the 1020th
simulation frame, not to the 1020th generator.
References
----------
.. [1] Beauchamp, MSMBuilder2
"""
if k is None and distance_cutoff is None:
raise ValueError("I need some cutoff criterion! both k and "
"distance_cutoff can't both be none")
if k is None and distance_cutoff <= 0:
raise ValueError("With k=None you need to supply a legit distance_cutoff")
if distance_cutoff is None:
# set it below anything that can ever be reached
distance_cutoff = -1
if k is None:
k = sys.maxsize
distance_list = np.inf * np.ones(len(ptraj), dtype=np.float32)
assignments = -1 * np.ones(len(ptraj), dtype=np.int32)
generator_indices = []
for i in xrange(k):
new_ind = seed if i == 0 else np.argmax(distance_list)
if k == sys.maxsize:
logger.info("K-centers: Finding generator %i. Will finish when % .4f drops below % .4f", i, distance_list[new_ind], distance_cutoff)
else:
logger.info("K-centers: Finding generator %i", i)
if distance_list[new_ind] < distance_cutoff:
break
new_distance_list = metric.one_to_all(ptraj, ptraj, new_ind)
updated_indices = np.where(new_distance_list < distance_list)[0]
distance_list[updated_indices] = new_distance_list[updated_indices]
assignments[updated_indices] = new_ind
generator_indices.append(new_ind)
if verbose:
logger.info('KCenters found %d generators', i + 1)
return np.array(generator_indices), assignments, distance_list
def _clarans(metric, ptraj, k, num_local_minima, max_neighbors, local_swap=True, initial_medoids='kcenters', initial_assignments=None, initial_distance=None, verbose=True):
"""Run the CLARANS clustering algorithm on the frames in a trajectory
Reference
---------
.. [1] Ng, R.T, Jan, Jiawei, 'CLARANS: A Method For Clustering Objects For
Spatial Data Mining', IEEE Trans. on Knowledge and Data Engineering, vol. 14
no.5 pp. 1003-1016 Sep/Oct 2002
http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1033770
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
ptraj : prepared trajectory
ptraj return by the action of the preceding metric on a msmbuilder trajectory
k : int
number of desired clusters
num_local_minima : int
number of local minima in the set of all possible clusterings to identify.
Execution time will scale linearly with this parameter. The best of
these local minima will be returned.
max_neighbors : int
number of rejected swaps in a row necessary to declare a proposed
clustering a local minima
local_swap : bool, optional
If true, proposed swaps will be between a medoid and a data point
currently assigned to that medoid. If false, the data point for
the proposed swap is selected randomly.
initial_medoids : {'kcenters', 'random', ndarray}, optional
If 'kcenters', run kcenters clustering first to get the initial medoids,
and then run the swaps to improve it. If 'random', select the medoids at
random. Otherwise, initial_medoids should be a numpy array of the
indices of the medoids.
initial_assignments : {None, ndarray}, optional
If None, initial_assignments will be computed based on the
initial_medoids. If you pass in your own initial_medoids, you can also
pass in initial_assignments to avoid recomputing them.
initial_distances : {None, ndarray}, optional
If None, initial_distances will be computed based on the initial_medoids.
If you pass in your own initial_medoids, you can also pass in
initial_distances to avoid recomputing them.
verbose : bool, optional
Print information about the swaps being attempted
Returns
-------
generator_indices : ndarray
indices (with respect to ptraj) of the frames to be considered cluster centers
assignments : ndarray
the cluster center to which each frame is assigned to (1D)
distances : ndarray
distance from each of the frames to the cluster center it was assigned to
"""
num_frames = len(ptraj)
if initial_medoids == 'kcenters':
initial_medoids, initial_assignments, initial_distance = _kcenters(metric, ptraj, k)
elif initial_medoids == 'random':
initial_medoids = np.random.permutation(np.arange(num_frames))[0:k]
initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
else:
if not isinstance(initial_medoids, np.ndarray):
raise ValueError('Initial medoids should be a numpy array')
if initial_assignments is None or initial_distance is None:
initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
if not len(initial_assignments) == num_frames:
raise ValueError('Initial assignments is not the same length as ptraj')
if not len(initial_distance) == num_frames:
raise ValueError('Initial distance is not the same length as ptraj')
if not k == len(initial_medoids):
raise ValueError('Initial medoids not the same length as k')
initial_pmedoids = ptraj[initial_medoids]
initial_cost = np.sum(initial_distance)
min_cost = initial_cost
# these iterations could be parallelized
for i in xrange(num_local_minima):
logger.info('%s of %s local minima', i, num_local_minima)
# the cannonical clarans approach is to initialize the medoids that you
# start from randomly, but instead we use the kcenters medoids.
medoids = initial_medoids
pmedoids = initial_pmedoids
assignments = initial_assignments
distance_to_current = initial_distance
current_cost = initial_cost
optimal_medoids = initial_medoids
optimal_assignments = initial_assignments
optimal_distances = initial_distance
# loop over neighbors
j = 0
while j < max_neighbors:
medoid_i = np.random.randint(k)
old_medoid = medoids[medoid_i]
if local_swap is False:
trial_medoid = np.random.randint(num_frames)
else:
trial_medoid = random.choice(np.where(assignments == medoids[medoid_i])[0])
new_medoids = medoids.copy()
new_medoids[medoid_i] = trial_medoid
pmedoids = ptraj[new_medoids]
if type(pmedoids) == np.ndarray:
pmedoids = pmedoids.copy()
new_distances = distance_to_current.copy()
new_assignments = assignments.copy()
logger.info('swapping %s for %s...', old_medoid, trial_medoid)
distance_to_trial = metric.one_to_all(ptraj, ptraj, trial_medoid)
assigned_to_trial = np.where(distance_to_trial < distance_to_current)[0]
new_assignments[assigned_to_trial] = trial_medoid
new_distances[assigned_to_trial] = distance_to_trial[assigned_to_trial]
ambiguous = np.where((new_assignments == old_medoid) & \
(distance_to_trial >= distance_to_current))[0]
for l in ambiguous:
if len(ptraj) <= l:
logger.error(len(ptraj))
logger.error(l)
logger.error(ptraj.dtype)
logger.error(l.dtype)
d = metric.one_to_all(ptraj, pmedoids, l)
argmin = np.argmin(d)
new_assignments[l] = new_medoids[argmin]
new_distances[l] = d[argmin]
new_cost = np.sum(new_distances)
if new_cost < current_cost:
logger.info('Accept')
medoids = new_medoids
assignments = new_assignments
distance_to_current = new_distances
current_cost = new_cost
j = 0
else:
j += 1
logger.info('Reject')
if current_cost < min_cost:
min_cost = current_cost
optimal_medoids = medoids.copy()
optimal_assignments = assignments.copy()
optimal_distances = distance_to_current.copy()
return optimal_medoids, optimal_assignments, optimal_distances
def _clarans_helper(args):
return _clarans(*args)
def _hybrid_kmedoids(metric, ptraj, k=None, distance_cutoff=None, num_iters=10, local_swap=True, norm_exponent=2.0, too_close_cutoff=0.0001, ignore_max_objective=False, initial_medoids='kcenters', initial_assignments=None, initial_distance=None):
"""Run the hybrid kmedoids clustering algorithm to cluster a trajectory
References
----------
.. [1] Beauchamp, K. MSMBuilder2
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
ptraj : prepared trajectory
ptraj return by the action of the preceding metric on a msmbuilder trajectory
k : int
number of desired clusters
num_iters : int
number of swaps to attempt per medoid
local_swap : boolean, optional
If true, proposed swaps will be between a medoid and a data point
currently assigned to that medoid. If false, the data point for the
proposed swap is selected randomly.
norm_exponent : float, optional
exponent to use in pnorm of the distance to generate objective function
too_close_cutoff : float, optional
Summarily reject proposed swaps if the distance of the medoid to the trial
medoid is less than thus value
ignore_max_objective : boolean, optional
Ignore changes to the distance of the worst classified point, and only
reject or accept swaps based on changes to the p norm of all the data
points.
initial_medoids : {'kcenters', ndarray}
If 'kcenters', run kcenters clustering first to get the initial medoids,
and then run the swaps to improve it. If 'random', select the medoids at
random. Otherwise, initial_medoids should be a numpy array of the
indices of the medoids.
initial_assignments : {None, ndarray}, optional
If None, initial_assignments will be computed based on the
initial_medoids. If you pass in your own initial_medoids, you can also
pass in initial_assignments to avoid recomputing them.
initial_distances : {None, ndarray}, optional
If None, initial_distances will be computed based on the initial_medoids.
If you pass in your own initial_medoids, you can also pass in
initial_distances to avoid recomputing them.
"""
if k is None and distance_cutoff is None:
raise ValueError("I need some cutoff criterion! both k and distance_cutoff can't both be none")
if k is None and distance_cutoff <= 0:
raise ValueError("With k=None you need to supply a legit distance_cutoff")
if distance_cutoff is None:
# set it below anything that can ever be reached
distance_cutoff = -1
num_frames = len(ptraj)
if initial_medoids == 'kcenters':
initial_medoids, initial_assignments, initial_distance = _kcenters(metric, ptraj, k, distance_cutoff)
elif initial_medoids == 'random':
if k is None:
raise ValueError('You need to supply the number of clusters, k, you want')
initial_medoids = np.random.permutation(np.arange(num_frames))[0:k]
initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
else:
if not isinstance(initial_medoids, np.ndarray):
raise ValueError('Initial medoids should be a numpy array')
if initial_assignments is None or initial_distance is None:
initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
assignments = initial_assignments
distance_to_current = initial_distance
medoids = initial_medoids
pgens = ptraj[medoids]
k = len(initial_medoids)
obj_func = p_norm(distance_to_current, p=norm_exponent)
max_norm = p_norm(distance_to_current, p='max')
if not np.all(np.unique(medoids) == np.sort(medoids)):
raise ValueError('Initial medoids must be distinct')
if not np.all(np.unique(assignments) == sorted(medoids)):
raise ValueError('Initial assignments dont match initial medoids')
for iteration in xrange(num_iters):
for medoid_i in xrange(k):
if not np.all(np.unique(assignments) == sorted(medoids)):
raise ValueError('Loop invariant lost')
if local_swap is False:
trial_medoid = np.random.randint(num_frames)
else:
trial_medoid = random.choice(np.where(assignments == medoids[medoid_i])[0])
old_medoid = medoids[medoid_i]
if old_medoid == trial_medoid:
continue
new_medoids = medoids.copy()
new_medoids[medoid_i] = trial_medoid
pmedoids = ptraj[new_medoids]
new_distances = distance_to_current.copy()
new_assignments = assignments.copy()
logger.info('Sweep %d, swapping medoid %d (conf %d) for conf %d...', iteration, medoid_i, old_medoid, trial_medoid)
distance_to_trial = metric.one_to_all(ptraj, ptraj, trial_medoid)
if not np.all(np.isfinite(distance_to_trial)):
raise ValueError('distance metric returned nonfinite distances')
if distance_to_trial[old_medoid] < too_close_cutoff:
logger.info('Too close')
continue
assigned_to_trial = np.where(distance_to_trial < distance_to_current)[0]
new_assignments[assigned_to_trial] = trial_medoid
new_distances[assigned_to_trial] = distance_to_trial[assigned_to_trial]
ambiguous = np.where((new_assignments == old_medoid) & \
(distance_to_trial >= distance_to_current))[0]
for l in ambiguous:
d = metric.one_to_all(ptraj, pmedoids, l)
if not np.all(np.isfinite(d)):
raise ValueError('distance metric returned nonfinite distances')
argmin = np.argmin(d)
new_assignments[l] = new_medoids[argmin]
new_distances[l] = d[argmin]
new_obj_func = p_norm(new_distances, p=norm_exponent)
new_max_norm = p_norm(new_distances, p='max')
if new_obj_func < obj_func and (new_max_norm <= max_norm or ignore_max_objective is True):
logger.info("Accept. New f = %f, Old f = %f", new_obj_func, obj_func)
medoids = new_medoids
assignments = new_assignments
distance_to_current = new_distances
obj_func = new_obj_func
max_norm = new_max_norm
else:
logger.info("Reject. New f = %f, Old f = %f", new_obj_func, obj_func)
return medoids, assignments, distance_to_current
#####################################################################
# #
# End Clustering Functions #
# Begin Clustering Classes #
# #
#####################################################################
[docs]class Hierarchical(object):
allowable_methods = ['single', 'complete', 'average', 'weighted',
'centroid', 'median', 'ward']
[docs] def __init__(self, metric, trajectories, method='single', precomputed_values=None):
"""Initialize a hierarchical clusterer using the supplied distance
metric and method.
Method should be one of the fastcluster linkage methods,
namely 'single', 'complete', 'average', 'weighted', 'centroid', 'median',
or 'ward'.
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
trajectory : Trajectory list of Trajectorys
data to cluster
method : {'single', 'complete', 'average', 'weighted', 'centroid',
'median', 'ward'}
precomputed_values :
used internally to implement load_from_disk()
Notes
-----
This is implemenred with the fastcluster library, which can be downloaded
from CRAN http://cran.r-project.org/web/packages/fastcluster/
"""
if precomputed_values is not None:
precomputed_z_matrix, traj_lengths = precomputed_values
if isinstance(precomputed_z_matrix, np.ndarray) and precomputed_z_matrix.shape[1] == 4:
self.Z = precomputed_z_matrix
self.traj_lengths = traj_lengths
return
else:
raise Exception('Something is wrong')
if not isinstance(metric, metrics.AbstractDistanceMetric):
raise TypeError('%s is not an abstract distance metric' % metric)
if not method in self.allowable_methods:
raise ValueError("%s not in %s" % (method, str(self.allowable_methods)))
if isinstance(trajectories, md.Trajectory):
trajectories = [trajectories]
elif isinstance(trajectories, types.GeneratorType):
trajectories = list(trajectories)
self.traj_lengths = np.array([len(t) for t in trajectories])
# self.ptrajs = [self.metric.prepare_trajectory(traj) for traj in self.trajectories]
logger.info('Preparing...')
flat_trajectory = concatenate_trajectories(trajectories)
pflat_trajectory = metric.prepare_trajectory(flat_trajectory)
logger.info('Getting all to all pairwise distance matrix...')
dmat = metric.all_pairwise(pflat_trajectory)
logger.info('Done with all2all')
self.Z = fastcluster.linkage(dmat, method=method, preserve_input=False)
logger.info('Got Z matrix')
# self.Z = scipy.cluster.hierarchy.linkage(dmat, method=method)
def _oneD_assignments(self, k=None, cutoff_distance=None):
"""Assign the frames into clusters.
Either supply k, the number of clusters desired, or cutoff_distance, a
max diameteric of each cluster
Parameters
----------
k : int, optional
number of clusters desired
cutoff_distance : float, optional
max diameter of each cluster, as a cutoff
Returns
-------
assignments_1d : ndarray
1D array with the assignments of the flattened trajectory (internal).
See Also
--------
Hierarchical.get_assignments
"""
# note that we subtract 1 from the results given by fcluster since
# they start with 1-based numbering, but we want the lowest index cluster
# to be number 0
if k is not None and cutoff_distance is not None:
raise Exception('You cant supply both a k and cutoff distance')
elif k is not None:
return scipy.cluster.hierarchy.fcluster(self.Z, k, criterion='maxclust') - 1
elif cutoff_distance is not None:
return scipy.cluster.hierarchy.fcluster(self.Z, cutoff_distance, criterion='distance') - 1
else:
raise Exception('You need to supply either k or a cutoff distance')
def get_assignments(self, k=None, cutoff_distance=None):
"""Assign the frames into clusters.
Either supply k, the number of clusters desired, or cutoff_distance, a
max diameteric of each cluster
Parameters
----------
k : int, optional
number of clusters desired
cutoff_distance : float, optional
max diameter of each cluster, as a cutoff
Returns
-------
assignments : ndarray
2D array of shape num_trajs x length of longest traj. Padded with -1s
at the end if not all trajectories are the same length
"""
assgn_list = split(self._oneD_assignments(k, cutoff_distance), self.traj_lengths)
output = -1 * np.ones((len(self.traj_lengths), max(self.traj_lengths)), dtype='int')
for i, traj_assign in enumerate(assgn_list):
output[i][0:len(traj_assign)] = traj_assign
return output
def save_to_disk(self, filename):
"""Save this clusterer to disk.
This is useful because computing the Z-matrix
(done in __init__) is the most expensive part, and assigning is cheap
Parameters
----------
filename : str
location to save to
Raises
------
Exception if something already exists at `filename`
"""
io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)
@classmethod
def load_from_disk(cls, filename):
"""Load up a clusterer from disk
This is useful because computing the Z-matrix
(done in __init__) is the most expensive part, and assigning is cheap
Parameters
----------
filename : str
location to save to
Raises
------
TODO: Probablt raises something if filename doesn't exist?
"""
data = io.loadh(filename, deferred=False)
Z, traj_lengths = data['z_matrix'], data['traj_lengths']
# Next two lines are a hack to fix Serializer bug. KAB
if np.rank(traj_lengths) == 0:
traj_lengths = [traj_lengths]
return cls(None, None, precomputed_values=(Z, traj_lengths))
[docs]class BaseFlatClusterer(object):
"""
(Abstract) base class / mixin that Clusterers can extend. Provides convenience
functions for the user.
To implement a clusterer using this base class, subclass it and define your
init method to do the clustering you want, and then set self._generator_indices,
self._assignments, and self._distances with the result.
For convenience (and to enable some of its functionality), let BaseFlatCluster
prepare the trajectories for you by calling BaseFlatClusterer's __init__ method
and then using the prepared, concatenated trajectory self.ptraj for your clustering.
"""
[docs] def __init__(self, metric, trajectories=None, prep_trajectories=None):
if not isinstance(metric, metrics.AbstractDistanceMetric):
raise TypeError('%s is not an AbstractDistanceMetric' % metric)
# if we got a single trajectory instead a list of trajectories, make it a
# list
if not trajectories is None:
if isinstance(trajectories, md.Trajectory):
trajectories = [trajectories]
elif isinstance(trajectories, types.GeneratorType):
trajectories = list(trajectories)
self._concatenated = concatenate_trajectories(trajectories)
if prep_trajectories is None:
self.ptraj = metric.prepare_trajectory(self._concatenated)
self._traj_lengths = [len(traj) for traj in trajectories]
if not prep_trajectories is None: # If they also provide trajectories
# that's fine, but we will use the prep_trajectories
if isinstance(prep_trajectories, np.ndarray) or \
isinstance(prep_trajectories[0], np.ndarray):
prep_trajectories = np.array(prep_trajectories)
if len(prep_trajectories.shape) == 2:
prep_trajectories = [prep_trajectories]
else:
# 3D means a list of prep_trajectories was input
prep_trajectories = list(prep_trajectories)
if trajectories is None:
self._traj_lengths = [len(ptraj) for ptraj in prep_trajectories]
self._concatenated = None
self.ptraj = concatenate_prep_trajectories(prep_trajectories, metric)
if trajectories is None and prep_trajectories is None:
raise Exception("must provide at least one of trajectories and prep_trajectories")
self._metric = metric
self.num_frames = sum(self._traj_lengths)
# All the actual Clusterer objects that subclass this base class
# need to calculate these three parameters and store them here
# self._generator_indices[i] = j means that the jth frame of self.ptraj is
# considered a generator. self._assignments[i] = j should indicate that
# self.ptraj[j] is the coordinates of the the cluster center corresponding to i
# and self._distances[i] = f should indicate that the distance from self.ptraj[i]
# to self.ptraj[self._assignments[i]] is f.
self._generator_indices = 'abstract'
self._assignments = 'abstract'
self._distances = 'abstract'
def _ensure_generators_computed(self):
if self._generator_indices == 'abstract':
raise Exception('Your subclass of BaseFlatClusterer is implemented wrong and didnt compute self._generator_indicies.')
def _ensure_assignments_and_distances_computed(self):
if self._assignments == 'abstract' or self._distances == 'abstract':
self._assignments, self._distances = _assign(self._metric, self.ptraj, self._generator_indices)
def get_assignments(self):
"""Assign the trajectories you passed into the constructor based on
generators that have been identified
Returns
-------
assignments : ndarray
2D array of assignments where k = assignments[i,j] means that the
jth frame in the ith trajectory is assigned to the center whose
coordinates are in the kth frame of the trajectory in
get_generators_as_traj()
"""
self._ensure_generators_computed()
self._ensure_assignments_and_distances_computed()
twoD = split(self._assignments, self._traj_lengths)
# the numbers in self._assignments are indices with respect to self.ptraj,
# but we want indices with respect to the number in the trajectory of generators
# returned by get_generators_as_traj()
ptraj_index_to_gens_traj_index = np.zeros(self.num_frames)
for i, g in enumerate(self._generator_indices):
ptraj_index_to_gens_traj_index[g] = i
# put twoD into a rectangular array
output = -1 * np.ones((len(self._traj_lengths), max(self._traj_lengths)), dtype=np.int32)
for i, traj_assign in enumerate(twoD):
output[i, 0:len(traj_assign)] = ptraj_index_to_gens_traj_index[traj_assign]
return output
def get_distances(self):
"""Extract the distance from each frame to its assigned cluster kcenter
Returns
-------
distances : ndarray
2D array of size num_trajs x length of longest traj, such that
distances[i,j] gives the distance from the ith trajectorys jth
frame to its assigned cluster center
"""
self._ensure_generators_computed()
self._ensure_assignments_and_distances_computed()
twoD = split(self._distances, self._traj_lengths)
# put twoD into a rectangular array
output = -1 * np.ones((len(self._traj_lengths), max(self._traj_lengths)), dtype='float32')
for i, traj_distances in enumerate(twoD):
output[i][0:len(traj_distances)] = traj_distances
return output
def get_generators_as_traj(self):
"""Get a trajectory containing the generators
Returns
-------
traj or ptraj : msmbuilder.Trajectory or np.ndarray
a trajectory object where each frame is one of the
generators/medoids identified. If trajectories was
not originally provided, then will only return the
prepared generators
"""
self._ensure_generators_computed()
if self._concatenated is None:
output = self.ptraj[self._generator_indices]
else:
output = self._concatenated[self._generator_indices]
return output
def get_generator_indices(self):
"""Get the generator indices corresponding to frames in
self.ptraj.
Returns
-------
gen_inds : np.ndarray
generator indices corresponding to the generators in
self.ptraj
"""
return self._generator_indices
[docs]class KCenters(BaseFlatClusterer):
[docs] def __init__(self, metric, trajectories=None, prep_trajectories=None,
k=None, distance_cutoff=None, seed=0):
"""Run kcenters clustering algorithm.
Terminates either when `k` clusters have been identified, or when every data
is clustered better than `distance_cutoff`.
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
trajectory : Trajectory or list of msmbuilder.Trajectory
data to cluster
k : {int, None}
number of desired clusters, or None
distance_cutoff : {float, None}
Stop identifying new clusters once the distance of every data to its
cluster center falls below this value. Supply either this or `k`
seed : int, optional
index of the frame to use as the first cluster center
See Also
--------
_kcenters : implementation
References
----------
.. [1] Beauchamp, MSMBuilder2
"""
s = super(KCenters, self) if PY2 else super()
s.__init__(metric, trajectories, prep_trajectories)
gi, asgn, dl = _kcenters(metric, self.ptraj, k, distance_cutoff, seed)
# note that the assignments here are with respect to the numbering
# in the trajectory -- they are not contiguous. Using the get_assignments()
# method defined on the superclass (BaseFlatClusterer) will convert them
# back into the contiguous numbering scheme (with respect to position in the
# self._generator_indices).
self._generator_indices = gi
self._assignments = asgn
self._distances = dl
[docs]class Clarans(BaseFlatClusterer):
[docs] def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
num_local_minima=10, max_neighbors=20, local_swap=False):
"""Run the CLARANS clustering algorithm on the frames in a trajectory
Reference
---------
.. [1] Ng, R.T, Jan, Jiawei, 'CLARANS: A Method For Clustering Objects For
Spatial Data Mining', IEEE Trans. on Knowledge and Data Engineering, vol. 14
no.5 pp. 1003-1016 Sep/Oct 2002
http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1033770
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
trajectory : Trajectory or list of msmbuilder.Trajectory
data to cluster
k : int
number of desired clusters
num_local_minima : int
number of local minima in the set of all possible clusterings to identify.
Execution time will scale linearly with this parameter. The best of
these local minima will be returned.
max_neighbors : int
number of rejected swaps in a row necessary to declare a proposed
clustering a local minima
local_swap : bool, optional
If true, proposed swaps will be between a medoid and a data point
currently assigned to that medoid. If false, the data point for
the proposed swap is selected randomly.
See Also
--------
_kcenters : implementation
SubsampledClarans : random subsampling version (faster)
"""
s = super(Clarans, self) if PY2 else super()
s.__init__(metric, trajectories, prep_trajectories)
medoids, assignments, distances = _clarans(metric, self.ptraj, k,
num_local_minima, max_neighbors, local_swap, initial_medoids='kcenters')
self._generator_indices = medoids
self._assignments = assignments
self._distances = distances
class SubsampledClarans(BaseFlatClusterer):
def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
num_samples=None, shrink_multiple=None, num_local_minima=10,
max_neighbors=20, local_swap=False, parallel=None):
""" Run the CLARANS algorithm (see the Clarans class for more description) on
multiple subsamples of the data drawn randomly.
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
trajectories : Trajectory or list of msmbuilder.Trajectory
data to cluster
prep_trajectories : np.ndarray or None
prepared trajectories instead of msmbuilder.Trajectory
k : int
number of desired clusters
num_samples : int
number of random subsamples to draw
shrink_multiple : int
Each of the subsamples drawn will be of size equal to the total
number of frames divided by this number
num_local_minima : int, optional
number of local minima in the set of all possible clusterings
to identify. Execution time will scale linearly with this
parameter. The best of these local minima will be returned.
max_neighbors : int, optional
number of rejected swaps in a row necessary to declare a proposed
clustering a local minima
local_swap : bool, optional
If true, proposed swaps will be between a medoid and a data point
currently assigned to that medoid. If false, the data point for
the proposed swap is selected randomly
parallel : {None, 'multiprocessing', 'dtm}
Which parallelization library to use. Each of the random subsamples
are run independently
"""
s = super(SubsampledClarans, self) if PY2 else super()
s.__init__(metric, trajectories, prep_trajectories)
if parallel is None:
mymap = map
elif parallel == 'multiprocessing':
mymap = Pool().map
elif parallel == 'dtm':
mymap = dtm.map
else:
raise ValueError('Unrecognized parallelization')
# function that returns a list of random indices
gen_sub_indices = lambda: np.array(random.sample(np.arange(self.num_frames), self.num_frames / shrink_multiple))
# gen_sub_indices = lambda: np.arange(self.num_frames)
sub_indices = [gen_sub_indices() for i in range(num_samples)]
ptrajs = [self.ptraj[sub_indices[i]] for i in range(num_samples)]
clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima, max_neighbors, local_swap, ['kcenters'], None, None, False)
results = mymap(_clarans_helper, clarans_args)
medoids_list, assignments_list, distances_list = list(zip(*results))
best_i = np.argmin([np.sum(d) for d in distances_list])
# print 'best i', best_i
# print 'best medoids (relative to subindices)', medoids_list[best_i]
# print 'sub indices', sub_indices[best_i]
# print 'best_medoids', sub_indices[best_i][medoids_list[best_i]]
self._generator_indices = sub_indices[best_i][medoids_list[best_i]]
[docs]class HybridKMedoids(BaseFlatClusterer):
[docs] def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
distance_cutoff=None, local_num_iters=10, global_num_iters=0,
norm_exponent=2.0, too_close_cutoff=.0001, ignore_max_objective=False):
"""Run the hybrid kmedoids clustering algorithm on a set of trajectories
Parameters
----------
metric : msmbuilder.metrics.AbstractDistanceMetric
A metric capable of handling `ptraj`
trajectory : Trajectory or list of msmbuilder.Trajectory
data to cluster
k : int
number of desired clusters
num_iters : int
number of swaps to attempt per medoid
local_swap : boolean, optional
If true, proposed swaps will be between a medoid and a data point
currently assigned to that medoid. If false, the data point for the
proposed swap is selected randomly.
norm_exponent : float, optional
exponent to use in pnorm of the distance to generate objective function
too_close_cutoff : float, optional
Summarily reject proposed swaps if the distance of the medoid to the trial
medoid is less than thus value
ignore_max_objective : boolean, optional
Ignore changes to the distance of the worst classified point, and only
reject or accept swaps based on changes to the p norm of all the data
points.
References
----------
.. [1] Beauchamp, K, et. al. MSMBuilder2
See Also
--------
KCenters : faster, less accurate
Clarans : slightly more clever termination criterion
"""
s = super(HybridKMedoids, self) if PY2 else super()
s.__init__(metric, trajectories, prep_trajectories)
medoids, assignments, distances = _hybrid_kmedoids(
metric, self.ptraj, k, distance_cutoff, local_num_iters, True, norm_exponent,
too_close_cutoff, ignore_max_objective, initial_medoids='kcenters')
if global_num_iters != 0:
medoids, assignments, distances = _hybrid_kmedoids(
metric, self.ptraj, k, distance_cutoff, global_num_iters, False, norm_exponent,
too_close_cutoff, ignore_max_objective, medoids, assignments, distances)
self._generator_indices = medoids
self._assignments = assignments
self._distances = distances