Source code for msmbuilder.clustering

from __future__ import print_function, division, absolute_import
from mdtraj.utils.six import PY2
from mdtraj.utils.six.moves import xrange
import sys
import types
import random
import numpy as np
try:
    import fastcluster
except ImportError:
    pass
import scipy.cluster.hierarchy

import mdtraj as md

from msmbuilder import metrics
from mdtraj import io
from msmbuilder.utils import uneven_zip

from multiprocessing import Pool
try:
    from deap import dtm  # has parallel map() implementation via mpi
except:
    pass

import logging
logger = logging.getLogger(__name__)

#####################################################################
#                                                                   #
#                       Begin Helper Functions                      #
#                                                                   #
#####################################################################


def concatenate_trajectories(trajectories):
    """Concatenate a list of trajectories into a single long trajectory

    Parameters
    ----------
    trajectories : list
        list of mdtraj.Trajectory object

    Returns
    -------
    concat_traj : mdtraj.Trajectory
        one trajectory containing the frames of every input, in order

    Raises
    ------
    AssertionError
        if `trajectories` is empty
    """
    assert len(trajectories) > 0, 'Please supply a list of trajectories'
    concat_traj = trajectories[0]
    # Use mdtraj operator overloading ("+" joins trajectories end to end).
    # Slicing replaces the old xrange(1, len(...)) index loop -- same
    # iteration order, no Py2 compatibility shim needed.
    for extra in trajectories[1:]:
        concat_traj += extra
    return concat_traj
def concatenate_prep_trajectories(prep_trajectories, metric):
    """Concatenate a list of prepared trajectories and create a single
    prepared_trajectory.

    This is non-trivial because the RMSD and LPRMSD prepared trajectories
    are not np.ndarrays

    Parameters
    ----------
    prep_trajectories : list
        list of prepared trajectories
    metric : msmbuilder.metrics.AbstractDistance metric subclass instance
        metric used to prepare the trajectories. Needed for RMSD and LPRMSD
        since concatenation requires recreating the prepared trajectory

    Returns
    -------
    ptraj : prepared_trajectory
        prepared trajectory instance, like that returned from
        metric.prepare_trajectory

    Raises
    ------
    Exception
        if the prepared trajectory type is not recognized
    """
    if isinstance(prep_trajectories[0], np.ndarray):
        ptraj = np.concatenate(prep_trajectories)
    elif isinstance(prep_trajectories[0], metrics.RMSD.TheoData):
        # BUGFIX: the original referenced the bare name `RMSD`, which is not
        # imported in this module (only `metrics` is), causing a NameError
        # whenever this branch's check was evaluated. Use the qualified name.
        # Slice to :NumAtoms before re-preparing -- XYZData appears to be
        # padded beyond the real atom count (NOTE: confirm against TheoData).
        xyz = np.concatenate([p.XYZData[:, :, :p.NumAtoms] for p in prep_trajectories])
        xyz = xyz.transpose((0, 2, 1))
        ptraj = metric.TheoData(xyz)
    else:
        raise Exception("unrecognized prepared trajectory."
                        "NOTE: LPRMSD currently unsupported. Email schwancr@stanford.edu")
    return ptraj
def unconcatenate_trajectory(trajectory, lengths):
    """Undo a concatenation: cut one long trajectory back into the original
    pieces.

    The trajectory must have been built by joining separate trajectories
    whose lengths you still know; those lengths drive the cut points.

    Parameters
    ----------
    trajectory : mdtraj.Trajectory
        Long trajectory to be split
    lengths : array_like
        list of lengths to split the long trajectory into

    Returns
    -------
    A list of trajectories
    """
    # Delegate to the generic sequence splitter.
    pieces = split(trajectory, lengths)
    return pieces
def split(longlist, lengths):
    """Split a long sequence into consecutive segments.

    Parameters
    ----------
    longlist : array_like
        Long sequence to be split
    lengths : array_like
        list of lengths to split the long sequence into; must sum to
        len(longlist)

    Returns
    -------
    A list of lists (or of slices of whatever sequence type was passed in)
    """
    if sum(lengths) != len(longlist):
        raise Exception('sum(lengths)=%s, len(longlist)=%s' % (sum(lengths), len(longlist)))
    # Walk a running offset instead of precomputing cumulative sums.
    segments = []
    start = 0
    for seg_len in lengths:
        segments.append(longlist[start: start + seg_len])
        start += seg_len
    return segments
def stochastic_subsample(trajectories, shrink_multiple):
    """Randomly subsample from a trajectory

    Given a list of trajectories, return a single trajectory `shrink_multiple`
    times smaller than the total number of frames in `trajectories`, taken by
    random sampling of frames.

    Parameters
    ----------
    trajectories : mdtraj.Trajectory, or list of mdtraj.Trajectory
        trajectory or trajectories to sample from
    shrink_multiple : int
        factor to shrink the total frame count by; must be >= 1

    Returns
    -------
    subsampled : mdtraj.Trajectory, list, or None
        a single concatenated trajectory of sampled frames. When
        shrink_multiple == 1 the input is returned unchanged; for a single
        trajectory shorter than shrink_multiple, None is returned.

    Notes
    -----
    Sampling is without replacement within each trajectory. Frame order in
    the output follows the random sample order, not the original order.
    The old docstring warned this may modify the inputs; indexing with
    `traj[indices]` appears to create new objects, so it should not --
    confirm against mdtraj before relying on it.
    """
    shrink_multiple = int(shrink_multiple)
    if shrink_multiple < 1:
        raise ValueError('Shrink multiple should be an integer greater '
                         'than or equal to 1. You supplied %s' % shrink_multiple)
    elif shrink_multiple == 1:
        # No shrinking requested: hand the input back untouched.
        return trajectories

    if isinstance(trajectories, md.Trajectory):
        traj = trajectories
        length = traj.n_frames
        new_length = int(length / shrink_multiple)
        if new_length <= 0:
            # too short to contribute even one frame
            return None
        # BUGFIX: random.sample requires a real Sequence; passing a numpy
        # array raises TypeError on Python 3.11+. Sample indices from range().
        indices = np.array(random.sample(range(length), new_length))
        # BUGFIX: mdtraj.Trajectory.__getitem__ takes a single index object,
        # not a (frame, atom, xyz) tuple -- index by frame only.
        return traj[indices]
    else:
        # assume we have a list of trajectories;
        # check that all trajectories have the same number of atoms
        num_atoms = np.array([traj.n_atoms for traj in trajectories])
        if not np.all(num_atoms == num_atoms[0]):
            raise Exception('Not all same # atoms')
        # shrink each trajectory, dropping those too short to sample from
        subsampled = [stochastic_subsample(traj, shrink_multiple)
                      for traj in trajectories]
        subsampled = [a for a in subsampled if a is not None]
        return concatenate_trajectories(subsampled)
def deterministic_subsample(trajectories, stride, start=0):
    """Subsample trajectories by taking every `stride`-th frame.

    Given a list of trajectories, return a single trajectory containing
    every "stride"th frame of each input, starting from "start".

    Parameters
    ----------
    trajectories : mdtraj.Trajectory, or list of mdtraj.Trajectory
        trajectories to subsample from
    stride : int
        freq to subsample at; must be >= 1
    start : int
        first frame to pick

    Returns
    -------
    trajectory : mdtraj.Trajectory
        shortened trajectory (for stride == 1, the input is returned
        unchanged)
    """
    stride = int(stride)
    if stride < 1:
        raise ValueError('stride should be an integer greater than or equal '
                         'to 1. You supplied %s' % stride)
    elif stride == 1:
        # nothing to do -- hand the input back untouched
        return trajectories

    # BUGFIX: this test used the bare name `Trajectory`, which is undefined
    # in this module (NameError); sibling functions use md.Trajectory.
    if isinstance(trajectories, md.Trajectory):
        return trajectories[start::stride]
    else:
        # assume we have a list of trajectories;
        # check that all trajectories have the same number of atoms
        num_atoms = np.array([traj.n_atoms for traj in trajectories])
        if not np.all(num_atoms == num_atoms[0]):
            raise Exception('Not all same # atoms')
        # subsample each trajectory, then join the results
        strided = [deterministic_subsample(traj, stride, start)
                   for traj in trajectories]
        return concatenate_trajectories(strided)
def p_norm(data, p=2):
    """Normalized p-norm of an ndarray.

    Parameters
    ----------
    data : ndarray
        values to reduce (e.g. distances, XYZ coordinates)
    p : {int, "max"}, optional
        power of the p_norm, or "max" for the largest entry

    Returns
    -------
    value : float
        ((data ** p).sum() / data.shape[0]) ** (1/p), or data.max() when
        p == "max"
    """
    if p == "max":
        return data.max()
    exponent = float(p)
    num_rows = float(data.shape[0])
    mean_power = (data ** exponent).sum() / num_rows
    return mean_power ** (1.0 / exponent)


#####################################################################
#                                                                   #
#                      End Helper Functions                         #
#                     Begin Clustering Function                     #
#                                                                   #
#####################################################################
def _assign(metric, ptraj, generator_indices):
    """Assign the frames in ptraj to the centers with indices *generator_indices*

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    ptraj : prepared trajectory
        ptraj return by the action of the preceding metric on a msmbuilder
        trajectory
    generator_indices : array_like
        indices (with respect to ptraj) of the frames to be considered the
        cluster centers.

    Returns
    -------
    assignments : ndarray
        `assignments[i] = j` means that the `i`th frame in ptraj is assigned
        to `ptraj[j]`
    distances : ndarray
        `distances[i] = j` means that the distance (according to `metric`)
        from `ptraj[i]` to `ptraj[assignments[i]]` is `j`
    """
    assignments = np.zeros(len(ptraj), dtype='int')
    distances = np.inf * np.ones(len(ptraj), dtype='float32')
    # For each center, compute its distance to every frame and keep the
    # running minimum / argmin across centers.
    for m in generator_indices:
        d = metric.one_to_all(ptraj, ptraj, m)
        closer = np.where(d < distances)[0]
        distances[closer] = d[closer]
        assignments[closer] = m
    return assignments, distances


def _kcenters(metric, ptraj, k=None, distance_cutoff=None, seed=0, verbose=True):
    """Run kcenters clustering algorithm.

    Terminates either when `k` clusters have been identified, or when every
    data is clustered better than `distance_cutoff`.

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    ptraj : prepared trajectory
        ptraj return by the action of the preceding metric on a msmbuilder
        trajectory
    k : {int, None}
        number of desired clusters, or None
    distance_cutoff : {float, None}
        Stop identifying new clusters once the distance of every data to its
        cluster center falls below this value. Supply either this or `k`
    seed : int, optional
        index of the frame to use as the first cluster center
    verbose : bool, optional
        print as each new generator is found

    Returns
    -------
    generator_indices : ndarray
        indices (with respect to ptraj) of the frames to be considered
        cluster centers
    assignments : ndarray
        the cluster center to which each frame is assigned to (1D)
    distances : ndarray
        distance from each of the frames to the cluster center it was
        assigned to

    See Also
    --------
    KCenters : wrapper around this implementation that provides more
        convenience

    Notes
    ------
    the assignments are numbered with respect to the position in ptraj of the
    generator, not the position in generator_indices. That is,
    assignments[10] = 1020 means that the 10th simulation frame is assigned
    to the 1020th simulation frame, not to the 1020th generator.

    References
    ----------
    .. [1] Beauchamp, MSMBuilder2
    """
    if k is None and distance_cutoff is None:
        raise ValueError("I need some cutoff criterion! both k and "
                         "distance_cutoff can't both be none")
    if k is None and distance_cutoff <= 0:
        raise ValueError("With k=None you need to supply a legit distance_cutoff")
    if distance_cutoff is None:
        # set it below anything that can ever be reached
        distance_cutoff = -1
    if k is None:
        # effectively "no limit on number of clusters"; the distance_cutoff
        # break below is then the only termination condition
        k = sys.maxsize

    distance_list = np.inf * np.ones(len(ptraj), dtype=np.float32)
    assignments = -1 * np.ones(len(ptraj), dtype=np.int32)

    generator_indices = []
    for i in xrange(k):
        # greedy farthest-point selection: the next center is the frame
        # currently worst-served by the existing centers
        new_ind = seed if i == 0 else np.argmax(distance_list)
        if k == sys.maxsize:
            logger.info("K-centers: Finding generator %i. Will finish when % .4f drops below % .4f", i, distance_list[new_ind], distance_cutoff)
        else:
            logger.info("K-centers: Finding generator %i", i)
        if distance_list[new_ind] < distance_cutoff:
            break
        new_distance_list = metric.one_to_all(ptraj, ptraj, new_ind)
        # adopt the new center wherever it is closer than the previous best
        updated_indices = np.where(new_distance_list < distance_list)[0]
        distance_list[updated_indices] = new_distance_list[updated_indices]
        assignments[updated_indices] = new_ind
        generator_indices.append(new_ind)

    if verbose:
        logger.info('KCenters found %d generators', i + 1)

    return np.array(generator_indices), assignments, distance_list


def _clarans(metric, ptraj, k, num_local_minima, max_neighbors,
             local_swap=True, initial_medoids='kcenters',
             initial_assignments=None, initial_distance=None, verbose=True):
    """Run the CLARANS clustering algorithm on the frames in a trajectory

    Reference
    ---------
    .. [1] Ng, R.T, Jan, Jiawei, 'CLARANS: A Method For Clustering Objects
        For Spatial Data Mining', IEEE Trans. on Knowledge and Data
        Engineering, vol. 14 no.5 pp. 1003-1016 Sep/Oct 2002
        http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1033770

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    ptraj : prepared trajectory
        ptraj return by the action of the preceding metric on a msmbuilder
        trajectory
    k : int
        number of desired clusters
    num_local_minima : int
        number of local minima in the set of all possible clusterings to
        identify. Execution time will scale linearly with this parameter.
        The best of these local minima will be returned.
    max_neighbors : int
        number of rejected swaps in a row necessary to declare a proposed
        clustering a local minima
    local_swap : bool, optional
        If true, proposed swaps will be between a medoid and a data point
        currently assigned to that medoid. If false, the data point for the
        proposed swap is selected randomly.
    initial_medoids : {'kcenters', 'random', ndarray}, optional
        If 'kcenters', run kcenters clustering first to get the initial
        medoids, and then run the swaps to improve it. If 'random', select
        the medoids at random. Otherwise, initial_medoids should be a numpy
        array of the indices of the medoids.
    initial_assignments : {None, ndarray}, optional
        If None, initial_assignments will be computed based on the
        initial_medoids. If you pass in your own initial_medoids, you can
        also pass in initial_assignments to avoid recomputing them.
    initial_distance : {None, ndarray}, optional
        If None, initial_distance will be computed based on the
        initial_medoids. If you pass in your own initial_medoids, you can
        also pass in initial_distance to avoid recomputing them.
    verbose : bool, optional
        Print information about the swaps being attempted

    Returns
    -------
    generator_indices : ndarray
        indices (with respect to ptraj) of the frames to be considered
        cluster centers
    assignments : ndarray
        the cluster center to which each frame is assigned to (1D)
    distances : ndarray
        distance from each of the frames to the cluster center it was
        assigned to
    """
    num_frames = len(ptraj)

    if initial_medoids == 'kcenters':
        initial_medoids, initial_assignments, initial_distance = _kcenters(metric, ptraj, k)
    elif initial_medoids == 'random':
        initial_medoids = np.random.permutation(np.arange(num_frames))[0:k]
        initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
    else:
        if not isinstance(initial_medoids, np.ndarray):
            raise ValueError('Initial medoids should be a numpy array')
        if initial_assignments is None or initial_distance is None:
            initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)

    if not len(initial_assignments) == num_frames:
        raise ValueError('Initial assignments is not the same length as ptraj')
    if not len(initial_distance) == num_frames:
        raise ValueError('Initial distance is not the same length as ptraj')
    if not k == len(initial_medoids):
        raise ValueError('Initial medoids not the same length as k')

    initial_pmedoids = ptraj[initial_medoids]
    # total cost of a clustering = sum of distances to assigned medoids
    initial_cost = np.sum(initial_distance)
    min_cost = initial_cost

    # these iterations could be parallelized
    for i in xrange(num_local_minima):
        logger.info('%s of %s local minima', i, num_local_minima)

        # the cannonical clarans approach is to initialize the medoids that you
        # start from randomly, but instead we use the kcenters medoids.
        medoids = initial_medoids
        pmedoids = initial_pmedoids
        assignments = initial_assignments
        distance_to_current = initial_distance
        current_cost = initial_cost

        optimal_medoids = initial_medoids
        optimal_assignments = initial_assignments
        optimal_distances = initial_distance

        # loop over neighbors: j counts consecutive rejected swaps, and is
        # reset to zero on every accepted swap
        j = 0
        while j < max_neighbors:
            medoid_i = np.random.randint(k)
            old_medoid = medoids[medoid_i]
            if local_swap is False:
                trial_medoid = np.random.randint(num_frames)
            else:
                trial_medoid = random.choice(np.where(assignments == medoids[medoid_i])[0])

            new_medoids = medoids.copy()
            new_medoids[medoid_i] = trial_medoid
            pmedoids = ptraj[new_medoids]
            # fancy-indexing an ndarray returns a view-like array sharing no
            # memory guarantee; copy to be safe before the medoids change
            if type(pmedoids) == np.ndarray:
                pmedoids = pmedoids.copy()

            new_distances = distance_to_current.copy()
            new_assignments = assignments.copy()

            logger.info('swapping %s for %s...', old_medoid, trial_medoid)

            distance_to_trial = metric.one_to_all(ptraj, ptraj, trial_medoid)
            # frames closer to the trial medoid than to their current one
            # simply switch allegiance
            assigned_to_trial = np.where(distance_to_trial < distance_to_current)[0]
            new_assignments[assigned_to_trial] = trial_medoid
            new_distances[assigned_to_trial] = distance_to_trial[assigned_to_trial]

            # frames that were assigned to the removed medoid but are NOT
            # closer to the trial medoid must be re-assigned by checking all
            # medoids explicitly
            ambiguous = np.where((new_assignments == old_medoid) & \
                                 (distance_to_trial >= distance_to_current))[0]
            for l in ambiguous:
                # debug output for an out-of-range index (should not happen)
                if len(ptraj) <= l:
                    logger.error(len(ptraj))
                    logger.error(l)
                    logger.error(ptraj.dtype)
                    logger.error(l.dtype)
                d = metric.one_to_all(ptraj, pmedoids, l)
                argmin = np.argmin(d)
                new_assignments[l] = new_medoids[argmin]
                new_distances[l] = d[argmin]

            new_cost = np.sum(new_distances)
            if new_cost < current_cost:
                logger.info('Accept')
                medoids = new_medoids
                assignments = new_assignments
                distance_to_current = new_distances
                current_cost = new_cost
                j = 0
            else:
                j += 1
                logger.info('Reject')

        # keep the best local minimum seen across restarts
        if current_cost < min_cost:
            min_cost = current_cost
            optimal_medoids = medoids.copy()
            optimal_assignments = assignments.copy()
            optimal_distances = distance_to_current.copy()

    return optimal_medoids, optimal_assignments, optimal_distances


def _clarans_helper(args):
    # Unpack-and-call shim so _clarans can be used with map()-style parallel
    # APIs that pass a single argument tuple.
    return _clarans(*args)


def _hybrid_kmedoids(metric, ptraj, k=None, distance_cutoff=None, num_iters=10,
                     local_swap=True, norm_exponent=2.0,
                     too_close_cutoff=0.0001, ignore_max_objective=False,
                     initial_medoids='kcenters', initial_assignments=None,
                     initial_distance=None):
    """Run the hybrid kmedoids clustering algorithm to cluster a trajectory

    References
    ----------
    .. [1] Beauchamp, K. MSMBuilder2

    Parameters
    ----------
    metric : msmbuilder.metrics.AbstractDistanceMetric
        A metric capable of handling `ptraj`
    ptraj : prepared trajectory
        ptraj return by the action of the preceding metric on a msmbuilder
        trajectory
    k : int
        number of desired clusters
    num_iters : int
        number of swaps to attempt per medoid
    local_swap : boolean, optional
        If true, proposed swaps will be between a medoid and a data point
        currently assigned to that medoid. If false, the data point for the
        proposed swap is selected randomly.
    norm_exponent : float, optional
        exponent to use in pnorm of the distance to generate objective
        function
    too_close_cutoff : float, optional
        Summarily reject proposed swaps if the distance of the medoid to the
        trial medoid is less than thus value
    ignore_max_objective : boolean, optional
        Ignore changes to the distance of the worst classified point, and
        only reject or accept swaps based on changes to the p norm of all
        the data points.
    initial_medoids : {'kcenters', ndarray}
        If 'kcenters', run kcenters clustering first to get the initial
        medoids, and then run the swaps to improve it. If 'random', select
        the medoids at random. Otherwise, initial_medoids should be a numpy
        array of the indices of the medoids.
    initial_assignments : {None, ndarray}, optional
        If None, initial_assignments will be computed based on the
        initial_medoids. If you pass in your own initial_medoids, you can
        also pass in initial_assignments to avoid recomputing them.
    initial_distance : {None, ndarray}, optional
        If None, initial_distance will be computed based on the
        initial_medoids. If you pass in your own initial_medoids, you can
        also pass in initial_distance to avoid recomputing them.
    """
    if k is None and distance_cutoff is None:
        raise ValueError("I need some cutoff criterion! both k and distance_cutoff can't both be none")
    if k is None and distance_cutoff <= 0:
        raise ValueError("With k=None you need to supply a legit distance_cutoff")
    if distance_cutoff is None:
        # set it below anything that can ever be reached
        distance_cutoff = -1

    num_frames = len(ptraj)
    if initial_medoids == 'kcenters':
        initial_medoids, initial_assignments, initial_distance = _kcenters(metric, ptraj, k, distance_cutoff)
    elif initial_medoids == 'random':
        if k is None:
            raise ValueError('You need to supply the number of clusters, k, you want')
        initial_medoids = np.random.permutation(np.arange(num_frames))[0:k]
        initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)
    else:
        if not isinstance(initial_medoids, np.ndarray):
            raise ValueError('Initial medoids should be a numpy array')
        if initial_assignments is None or initial_distance is None:
            initial_assignments, initial_distance = _assign(metric, ptraj, initial_medoids)

    assignments = initial_assignments
    distance_to_current = initial_distance
    medoids = initial_medoids
    # NOTE(review): pgens appears to be unused after this assignment
    pgens = ptraj[medoids]
    k = len(initial_medoids)

    obj_func = p_norm(distance_to_current, p=norm_exponent)
    max_norm = p_norm(distance_to_current, p='max')

    if not np.all(np.unique(medoids) == np.sort(medoids)):
        raise ValueError('Initial medoids must be distinct')
    if not np.all(np.unique(assignments) == sorted(medoids)):
        raise ValueError('Initial assignments dont match initial medoids')

    for iteration in xrange(num_iters):
        # one proposed swap per medoid per sweep
        for medoid_i in xrange(k):
            if not np.all(np.unique(assignments) == sorted(medoids)):
                raise ValueError('Loop invariant lost')

            if local_swap is False:
                trial_medoid = np.random.randint(num_frames)
            else:
                trial_medoid = random.choice(np.where(assignments == medoids[medoid_i])[0])

            old_medoid = medoids[medoid_i]

            if old_medoid == trial_medoid:
                continue

            new_medoids = medoids.copy()
            new_medoids[medoid_i] = trial_medoid
            pmedoids = ptraj[new_medoids]

            new_distances = distance_to_current.copy()
            new_assignments = assignments.copy()

            logger.info('Sweep %d, swapping medoid %d (conf %d) for conf %d...', iteration, medoid_i, old_medoid, trial_medoid)

            distance_to_trial = metric.one_to_all(ptraj, ptraj, trial_medoid)
            if not np.all(np.isfinite(distance_to_trial)):
                raise ValueError('distance metric returned nonfinite distances')

            # skip swaps between nearly-identical conformations
            if distance_to_trial[old_medoid] < too_close_cutoff:
                logger.info('Too close')
                continue

            # frames closer to the trial medoid switch to it directly
            assigned_to_trial = np.where(distance_to_trial < distance_to_current)[0]
            new_assignments[assigned_to_trial] = trial_medoid
            new_distances[assigned_to_trial] = distance_to_trial[assigned_to_trial]

            # frames orphaned by removing old_medoid need a full re-check
            # against every medoid
            ambiguous = np.where((new_assignments == old_medoid) & \
                                 (distance_to_trial >= distance_to_current))[0]
            for l in ambiguous:
                d = metric.one_to_all(ptraj, pmedoids, l)
                if not np.all(np.isfinite(d)):
                    raise ValueError('distance metric returned nonfinite distances')
                argmin = np.argmin(d)
                new_assignments[l] = new_medoids[argmin]
                new_distances[l] = d[argmin]

            new_obj_func = p_norm(new_distances, p=norm_exponent)
            new_max_norm = p_norm(new_distances, p='max')

            # accept only if the p-norm objective improves AND (unless
            # explicitly ignored) the worst-case distance does not get worse
            if new_obj_func < obj_func and (new_max_norm <= max_norm or ignore_max_objective is True):
                logger.info("Accept. New f = %f, Old f = %f", new_obj_func, obj_func)
                medoids = new_medoids
                assignments = new_assignments
                distance_to_current = new_distances
                obj_func = new_obj_func
                max_norm = new_max_norm
            else:
                logger.info("Reject. New f = %f, Old f = %f", new_obj_func, obj_func)

    return medoids, assignments, distance_to_current


#####################################################################
#                                                                   #
#                      End Clustering Functions                     #
#                      Begin Clustering Classes                     #
#                                                                   #
#####################################################################
class Hierarchical(object):
    """Hierarchical clustering of trajectories via the fastcluster library.

    Computing the linkage (Z) matrix is the expensive step and is done once
    in __init__; extracting flat clusterings from it afterwards is cheap.
    """

    # linkage methods accepted by fastcluster.linkage
    allowable_methods = ['single', 'complete', 'average', 'weighted',
                         'centroid', 'median', 'ward']

    def __init__(self, metric, trajectories, method='single', precomputed_values=None):
        """Initialize a hierarchical clusterer using the supplied distance
        metric and method.

        Method should be one of the fastcluster linkage methods, namely
        'single', 'complete', 'average', 'weighted', 'centroid', 'median',
        or 'ward'.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectories : Trajectory or list of Trajectorys
            data to cluster
        method : {'single', 'complete', 'average', 'weighted', 'centroid',
                  'median', 'ward'}
        precomputed_values :
            used internally to implement load_from_disk()

        Notes
        -----
        This is implemented with the fastcluster library, which can be
        downloaded from CRAN http://cran.r-project.org/web/packages/fastcluster/
        """
        if precomputed_values is not None:
            # Bypass clustering entirely: adopt a previously computed Z
            # matrix (see load_from_disk). A valid linkage matrix has 4
            # columns.
            precomputed_z_matrix, traj_lengths = precomputed_values
            if isinstance(precomputed_z_matrix, np.ndarray) and precomputed_z_matrix.shape[1] == 4:
                self.Z = precomputed_z_matrix
                self.traj_lengths = traj_lengths
                return
            else:
                raise Exception('Something is wrong')

        if not isinstance(metric, metrics.AbstractDistanceMetric):
            raise TypeError('%s is not an abstract distance metric' % metric)
        if method not in self.allowable_methods:
            raise ValueError("%s not in %s" % (method, str(self.allowable_methods)))
        if isinstance(trajectories, md.Trajectory):
            trajectories = [trajectories]
        elif isinstance(trajectories, types.GeneratorType):
            trajectories = list(trajectories)
        self.traj_lengths = np.array([len(t) for t in trajectories])

        logger.info('Preparing...')
        flat_trajectory = concatenate_trajectories(trajectories)
        pflat_trajectory = metric.prepare_trajectory(flat_trajectory)

        logger.info('Getting all to all pairwise distance matrix...')
        dmat = metric.all_pairwise(pflat_trajectory)

        logger.info('Done with all2all')
        self.Z = fastcluster.linkage(dmat, method=method, preserve_input=False)
        logger.info('Got Z matrix')
        # self.Z = scipy.cluster.hierarchy.linkage(dmat, method=method)

    def _oneD_assignments(self, k=None, cutoff_distance=None):
        """Assign the frames into clusters.

        Either supply k, the number of clusters desired, or cutoff_distance,
        a max diameter of each cluster.

        Parameters
        ----------
        k : int, optional
            number of clusters desired
        cutoff_distance : float, optional
            max diameter of each cluster, as a cutoff

        Returns
        -------
        assignments_1d : ndarray
            1D array with the assignments of the flattened trajectory
            (internal).

        See Also
        --------
        Hierarchical.get_assignments
        """
        # note that we subtract 1 from the results given by fcluster since
        # they start with 1-based numbering, but we want the lowest index
        # cluster to be number 0
        if k is not None and cutoff_distance is not None:
            raise Exception('You cant supply both a k and cutoff distance')
        elif k is not None:
            return scipy.cluster.hierarchy.fcluster(self.Z, k, criterion='maxclust') - 1
        elif cutoff_distance is not None:
            return scipy.cluster.hierarchy.fcluster(self.Z, cutoff_distance, criterion='distance') - 1
        else:
            raise Exception('You need to supply either k or a cutoff distance')

    def get_assignments(self, k=None, cutoff_distance=None):
        """Assign the frames into clusters.

        Either supply k, the number of clusters desired, or cutoff_distance,
        a max diameter of each cluster.

        Parameters
        ----------
        k : int, optional
            number of clusters desired
        cutoff_distance : float, optional
            max diameter of each cluster, as a cutoff

        Returns
        -------
        assignments : ndarray
            2D array of shape num_trajs x length of longest traj. Padded
            with -1s at the end if not all trajectories are the same length
        """
        assgn_list = split(self._oneD_assignments(k, cutoff_distance), self.traj_lengths)
        # -1 padding marks slots past the end of shorter trajectories
        output = -1 * np.ones((len(self.traj_lengths), max(self.traj_lengths)), dtype='int')
        for i, traj_assign in enumerate(assgn_list):
            output[i][0:len(traj_assign)] = traj_assign
        return output

    def save_to_disk(self, filename):
        """Save this clusterer to disk.

        This is useful because computing the Z-matrix (done in __init__) is
        the most expensive part, and assigning is cheap.

        Parameters
        ----------
        filename : str
            location to save to

        Raises
        ------
        Exception if something already exists at `filename`
        """
        io.saveh(filename, z_matrix=self.Z, traj_lengths=self.traj_lengths)

    @classmethod
    def load_from_disk(cls, filename):
        """Load up a clusterer from disk.

        This is useful because computing the Z-matrix (done in __init__) is
        the most expensive part, and assigning is cheap.

        Parameters
        ----------
        filename : str
            location to load from

        Raises
        ------
        IOError-like error from mdtraj.io if `filename` doesn't exist
        """
        data = io.loadh(filename, deferred=False)
        Z, traj_lengths = data['z_matrix'], data['traj_lengths']
        # Next two lines are a hack to fix Serializer bug. KAB
        # BUGFIX: np.rank was removed from numpy (1.18); np.ndim is the
        # modern equivalent for "is this a 0-d scalar?"
        if np.ndim(traj_lengths) == 0:
            traj_lengths = [traj_lengths]
        return cls(None, None, precomputed_values=(Z, traj_lengths))
[docs]class BaseFlatClusterer(object): """ (Abstract) base class / mixin that Clusterers can extend. Provides convenience functions for the user. To implement a clusterer using this base class, subclass it and define your init method to do the clustering you want, and then set self._generator_indices, self._assignments, and self._distances with the result. For convenience (and to enable some of its functionality), let BaseFlatCluster prepare the trajectories for you by calling BaseFlatClusterer's __init__ method and then using the prepared, concatenated trajectory self.ptraj for your clustering. """
    def __init__(self, metric, trajectories=None, prep_trajectories=None):
        """Prepare (or adopt pre-prepared) trajectories for flat clustering.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            metric used to prepare/compare the trajectories
        trajectories : mdtraj.Trajectory, list of them, or generator, optional
            raw trajectories; prepared here via metric.prepare_trajectory
            unless prep_trajectories is also given
        prep_trajectories : ndarray or list of ndarray, optional
            already-prepared trajectories; takes precedence over
            `trajectories` for building self.ptraj

        Raises
        ------
        TypeError
            if metric is not an AbstractDistanceMetric
        Exception
            if neither trajectories nor prep_trajectories is supplied
        """
        if not isinstance(metric, metrics.AbstractDistanceMetric):
            raise TypeError('%s is not an AbstractDistanceMetric' % metric)

        # if we got a single trajectory instead a list of trajectories, make
        # it a list
        if not trajectories is None:
            if isinstance(trajectories, md.Trajectory):
                trajectories = [trajectories]
            elif isinstance(trajectories, types.GeneratorType):
                trajectories = list(trajectories)
            self._concatenated = concatenate_trajectories(trajectories)
            # only prepare here if the caller didn't hand us prepared data
            if prep_trajectories is None:
                self.ptraj = metric.prepare_trajectory(self._concatenated)
            self._traj_lengths = [len(traj) for traj in trajectories]

        if not prep_trajectories is None:
            # If they also provide trajectories
            # that's fine, but we will use the prep_trajectories
            if isinstance(prep_trajectories, np.ndarray) or \
                    isinstance(prep_trajectories[0], np.ndarray):
                # NOTE(review): np.array() on a list of different-length 2D
                # arrays produces an object array on modern numpy -- this
                # presumably expects equal-length or a single 2D/3D array;
                # confirm against callers.
                prep_trajectories = np.array(prep_trajectories)
                if len(prep_trajectories.shape) == 2:
                    # a single prepared trajectory: wrap it in a list
                    prep_trajectories = [prep_trajectories]
                else:
                    # 3D means a list of prep_trajectories was input
                    prep_trajectories = list(prep_trajectories)

            if trajectories is None:
                self._traj_lengths = [len(ptraj) for ptraj in prep_trajectories]
                self._concatenated = None

            self.ptraj = concatenate_prep_trajectories(prep_trajectories, metric)

        if trajectories is None and prep_trajectories is None:
            raise Exception("must provide at least one of trajectories and prep_trajectories")

        self._metric = metric
        self.num_frames = sum(self._traj_lengths)

        # All the actual Clusterer objects that subclass this base class
        # need to calculate these three parameters and store them here
        # self._generator_indices[i] = j means that the jth frame of self.ptraj is
        # considered a generator. self._assignments[i] = j should indicate that
        # self.ptraj[j] is the coordinates of the the cluster center corresponding to i
        # and self._distances[i] = f should indicate that the distance from self.ptraj[i]
        # to self.ptraj[self._assignments[i]] is f.
        # The 'abstract' sentinel string marks "not computed yet"; see
        # _ensure_generators_computed / _ensure_assignments_and_distances_computed.
        self._generator_indices = 'abstract'
        self._assignments = 'abstract'
        self._distances = 'abstract'
def _ensure_generators_computed(self):
    """Raise if a subclass failed to compute self._generator_indices."""
    # __init__ stores the string sentinel 'abstract'; check the type first
    # so a real ndarray of indices is never compared to a string (numpy's
    # elementwise string comparison is deprecated/fragile).
    if isinstance(self._generator_indices, str) and self._generator_indices == 'abstract':
        raise Exception('Your subclass of BaseFlatClusterer is implemented wrong and didnt compute self._generator_indicies.')

def _ensure_assignments_and_distances_computed(self):
    """Lazily compute assignments and distances if still at the sentinel."""
    # once computed these are ndarrays, so the isinstance guard avoids
    # ndarray-vs-str comparisons while preserving the lazy semantics
    if isinstance(self._assignments, str) or isinstance(self._distances, str):
        self._assignments, self._distances = _assign(self._metric, self.ptraj, self._generator_indices)

def get_assignments(self):
    """Assign the trajectories you passed into the constructor based on
    generators that have been identified

    Returns
    -------
    assignments : ndarray
        2D array of assignments where k = assignments[i,j] means that the
        jth frame in the ith trajectory is assigned to the center whose
        coordinates are in the kth frame of the trajectory in
        get_generators_as_traj()
    """
    self._ensure_generators_computed()
    self._ensure_assignments_and_distances_computed()

    twoD = split(self._assignments, self._traj_lengths)

    # the numbers in self._assignments are indices with respect to self.ptraj,
    # but we want indices with respect to the numbering in the trajectory of
    # generators returned by get_generators_as_traj().
    # dtype=int: these are indices, and this avoids a float->int32 cast below
    ptraj_index_to_gens_traj_index = np.zeros(self.num_frames, dtype=int)
    for i, g in enumerate(self._generator_indices):
        ptraj_index_to_gens_traj_index[g] = i

    # pack the ragged per-trajectory lists into a rectangular array,
    # padded with -1
    output = -1 * np.ones((len(self._traj_lengths), max(self._traj_lengths)), dtype=np.int32)
    for i, traj_assign in enumerate(twoD):
        output[i, 0:len(traj_assign)] = ptraj_index_to_gens_traj_index[traj_assign]
    return output

def get_distances(self):
    """Extract the distance from each frame to its assigned cluster kcenter

    Returns
    -------
    distances : ndarray
        2D array of size num_trajs x length of longest traj, such that
        distances[i,j] gives the distance from the ith trajectorys jth
        frame to its assigned cluster center
    """
    self._ensure_generators_computed()
    self._ensure_assignments_and_distances_computed()

    twoD = split(self._distances, self._traj_lengths)

    # pack the ragged per-trajectory lists into a rectangular array,
    # padded with -1
    output = -1 * np.ones((len(self._traj_lengths), max(self._traj_lengths)), dtype='float32')
    for i, traj_distances in enumerate(twoD):
        output[i][0:len(traj_distances)] = traj_distances
    return output

def get_generators_as_traj(self):
    """Get a trajectory containing the generators

    Returns
    -------
    traj or ptraj : msmbuilder.Trajectory or np.ndarray
        a trajectory object where each frame is one of the
        generators/medoids identified. If trajectories was not originally
        provided, then will only return the prepared generators
    """
    self._ensure_generators_computed()

    # when only prepared trajectories were supplied, _concatenated is None
    # and the best we can return is a slice of the prepared data
    if self._concatenated is None:
        output = self.ptraj[self._generator_indices]
    else:
        output = self._concatenated[self._generator_indices]
    return output

def get_generator_indices(self):
    """Get the generator indices corresponding to frames in self.ptraj.

    Returns
    -------
    gen_inds : np.ndarray
        generator indices corresponding to the generators in self.ptraj
    """
    return self._generator_indices
class KCenters(BaseFlatClusterer):

    def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
                 distance_cutoff=None, seed=0):
        """Run kcenters clustering algorithm.

        Clustering stops as soon as either `k` centers have been found, or
        every frame lies within `distance_cutoff` of its center.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectory : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        k : {int, None}
            number of desired clusters, or None
        distance_cutoff : {float, None}
            Stop identifying new clusters once the distance of every data
            to its cluster center falls below this value. Supply either
            this or `k`
        seed : int, optional
            index of the frame to use as the first cluster center

        See Also
        --------
        _kcenters : implementation

        References
        ----------
        .. [1] Beauchamp, MSMBuilder2
        """
        parent = super(KCenters, self) if PY2 else super()
        parent.__init__(metric, trajectories, prep_trajectories)

        gen_indices, assignments, distances = _kcenters(metric, self.ptraj, k,
                                                        distance_cutoff, seed)

        # _kcenters labels each frame with the ptraj index of its center,
        # so the labels are not contiguous cluster ids. The superclass's
        # get_assignments() converts them to contiguous numbering relative
        # to self._generator_indices.
        self._generator_indices = gen_indices
        self._assignments = assignments
        self._distances = distances
class Clarans(BaseFlatClusterer):

    def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
                 num_local_minima=10, max_neighbors=20, local_swap=False):
        """Run the CLARANS clustering algorithm on the frames in a trajectory

        Reference
        ---------
        .. [1] Ng, R.T, Jan, Jiawei, 'CLARANS: A Method For Clustering Objects
        For Spatial Data Mining', IEEE Trans. on Knowledge and Data Engineering,
        vol. 14 no.5 pp. 1003-1016 Sep/Oct 2002
        http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1033770

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectory : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        k : int
            number of desired clusters
        num_local_minima : int
            number of local minima in the set of all possible clusterings
            to identify; the best of them is returned. Runtime scales
            linearly with this parameter.
        max_neighbors : int
            number of rejected swaps in a row necessary to declare a
            proposed clustering a local minima
        local_swap : bool, optional
            If true, proposed swaps will be between a medoid and a data point
            currently assigned to that medoid. If false, the data point for the
            proposed swap is selected randomly.

        See Also
        --------
        _kcenters : implementation
        SubsampledClarans : random subsampling version (faster)
        """
        parent = super(Clarans, self) if PY2 else super()
        parent.__init__(metric, trajectories, prep_trajectories)

        clustering = _clarans(metric, self.ptraj, k, num_local_minima,
                              max_neighbors, local_swap,
                              initial_medoids='kcenters')

        # unpack (medoids, assignments, distances) from the best local minimum
        self._generator_indices = clustering[0]
        self._assignments = clustering[1]
        self._distances = clustering[2]
class SubsampledClarans(BaseFlatClusterer):

    def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
                 num_samples=None, shrink_multiple=None, num_local_minima=10,
                 max_neighbors=20, local_swap=False, parallel=None):
        """ Run the CLARANS algorithm (see the Clarans class for more description)
        on multiple subsamples of the data drawn randomly.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectories : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        prep_trajectories : np.ndarray or None
            prepared trajectories instead of msmbuilder.Trajectory
        k : int
            number of desired clusters
        num_samples : int
            number of random subsamples to draw
        shrink_multiple : int
            Each of the subsamples drawn will be of size equal to the total
            number of frames divided by this number
        num_local_minima : int, optional
            number of local minima in the set of all possible clusterings
            to identify. Execution time will scale linearly with this
            parameter. The best of these local minima will be returned.
        max_neighbors : int, optional
            number of rejected swaps in a row necessary to declare a proposed
            clustering a local minima
        local_swap : bool, optional
            If true, proposed swaps will be between a medoid and a data point
            currently assigned to that medoid. If false, the data point for the
            proposed swap is selected randomly
        parallel : {None, 'multiprocessing', 'dtm'}
            Which parallelization library to use. Each of the random
            subsamples are run independently
        """
        s = super(SubsampledClarans, self) if PY2 else super()
        s.__init__(metric, trajectories, prep_trajectories)

        if parallel is None:
            mymap = map
        elif parallel == 'multiprocessing':
            mymap = Pool().map
        elif parallel == 'dtm':
            mymap = dtm.map
        else:
            raise ValueError('Unrecognized parallelization')

        def gen_sub_indices():
            """Return a random subsample of frame indices (without replacement)."""
            # NOTE: floor division is required. This module does
            # ``from __future__ import division``, so ``/`` yields a float
            # and random.sample() would raise a TypeError on the sample size.
            sample_size = self.num_frames // shrink_multiple
            return np.array(random.sample(list(range(self.num_frames)), sample_size))

        sub_indices = [gen_sub_indices() for i in range(num_samples)]
        ptrajs = [self.ptraj[sub_indices[i]] for i in range(num_samples)]

        clarans_args = uneven_zip(metric, ptrajs, k, num_local_minima,
                                  max_neighbors, local_swap, ['kcenters'],
                                  None, None, False)

        results = mymap(_clarans_helper, clarans_args)
        medoids_list, assignments_list, distances_list = list(zip(*results))

        # keep the subsample whose clustering has the smallest total distance
        best_i = np.argmin([np.sum(d) for d in distances_list])

        # medoids_list[best_i] indexes into the subsample; map back to
        # indices into the full self.ptraj
        self._generator_indices = sub_indices[best_i][medoids_list[best_i]]
class HybridKMedoids(BaseFlatClusterer):

    def __init__(self, metric, trajectories=None, prep_trajectories=None, k=None,
                 distance_cutoff=None, local_num_iters=10, global_num_iters=0,
                 norm_exponent=2.0, too_close_cutoff=.0001,
                 ignore_max_objective=False):
        """Run the hybrid kmedoids clustering algorithm on a set of trajectories

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectory : Trajectory or list of msmbuilder.Trajectory
            data to cluster
        prep_trajectories : np.ndarray or None
            prepared trajectories instead of msmbuilder.Trajectory
        k : int
            number of desired clusters
        distance_cutoff : {float, None}
            passed through to _hybrid_kmedoids together with `k`
            (presumably a stopping criterion for the initial kcenters
            medoids -- confirm against _hybrid_kmedoids)
        local_num_iters : int, optional
            number of swaps to attempt per medoid during the local phase,
            where proposed swaps are between a medoid and a data point
            currently assigned to that medoid
        global_num_iters : int, optional
            number of swaps to attempt per medoid during the global phase,
            where the data point for a proposed swap is selected randomly.
            0 (the default) skips the global phase.
        norm_exponent : float, optional
            exponent to use in pnorm of the distance to generate objective
            function
        too_close_cutoff : float, optional
            Summarily reject proposed swaps if the distance of the medoid
            to the trial medoid is less than this value
        ignore_max_objective : boolean, optional
            Ignore changes to the distance of the worst classified point, and
            only reject or accept swaps based on changes to the p norm of
            all the data points.

        References
        ----------
        .. [1] Beauchamp, K, et. al. MSMBuilder2

        See Also
        --------
        KCenters : faster, less accurate
        Clarans : slightly more clever termination criterion
        """
        s = super(HybridKMedoids, self) if PY2 else super()
        s.__init__(metric, trajectories, prep_trajectories)

        # local phase: swap candidates restricted to frames assigned to the
        # medoid being swapped (local_swap argument is True)
        medoids, assignments, distances = _hybrid_kmedoids(
            metric, self.ptraj, k, distance_cutoff, local_num_iters, True,
            norm_exponent, too_close_cutoff, ignore_max_objective,
            initial_medoids='kcenters')

        # optional global phase: swap candidates drawn from all frames,
        # refining the medoids found by the local phase
        if global_num_iters != 0:
            medoids, assignments, distances = _hybrid_kmedoids(
                metric, self.ptraj, k, distance_cutoff, global_num_iters,
                False, norm_exponent, too_close_cutoff, ignore_max_objective,
                medoids, assignments, distances)

        self._generator_indices = medoids
        self._assignments = assignments
        self._distances = distances
Versions