This example builds HMMs and MSMs on the alanine dipeptide dataset using varying lag times and numbers of states, and compares the resulting relaxation timescales.
from __future__ import print_function
import os
from matplotlib.pyplot import *
from msmbuilder.featurizer import SuperposeFeaturizer
from msmbuilder.example_datasets import AlanineDipeptide
from msmbuilder.hmm import GaussianFusionHMM
from msmbuilder.cluster import KCenters
from msmbuilder.msm import MarkovStateModel
Featurization refers to the process of converting the conformational snapshots from your MD trajectories into vectors in some space $\mathbb{R}^N$ that can be manipulated and modeled by subsequent analyses. The Gaussian HMM, for instance, uses Gaussian emission distributions, so it models the trajectory as a time-dependent mixture of multivariate Gaussians.
In general, featurization is somewhat of an art. For this example, we're using Mixtape's SuperposeFeaturizer, which superposes each snapshot onto a reference frame (trajectories[0][0] in this example) and then measures the distance from each atom to its position in the reference conformation as the 'feature'.
print(AlanineDipeptide.description())
dataset = AlanineDipeptide().get()
trajectories = dataset.trajectories
topology = trajectories[0].topology
indices = [atom.index for atom in topology.atoms if atom.element.symbol in ['C', 'O', 'N']]
featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
sequences = featurizer.transform(trajectories)
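Since the featurization is somewhat of an art, other featurizers can be dropped into the same workflow. As an aside (not used in the rest of this example), here is a minimal sketch of a dihedral-angle featurization, assuming the DihedralFeaturizer shipped with msmbuilder/Mixtape; it represents each frame by the sines and cosines of its $\phi$ and $\psi$ backbone dihedrals:

from msmbuilder.featurizer import DihedralFeaturizer

# Alternative featurization (illustrative only): sin/cos of the backbone
# phi and psi dihedrals of each frame.
dihedral_featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=True)
dihedral_sequences = dihedral_featurizer.transform(trajectories)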
sequences (from the SuperposeFeaturizer above) is our featurized data: a list with one array of shape (n_frames, n_features) per trajectory.

lag_times = [1, 10, 20, 30, 40]
hmm_ts0 = {}
hmm_ts1 = {}
n_states = [3, 5]
for n in n_states:
    hmm_ts0[n] = []
    hmm_ts1[n] = []
    for lag_time in lag_times:
        # Subsample at this lag time: s[i::lag_time] keeps every lag_time-th
        # frame starting from offset i, so each trajectory contributes
        # lag_time strided copies.
        strided_data = [s[i::lag_time] for s in sequences for i in range(lag_time)]
        hmm = GaussianFusionHMM(n_states=n, n_features=sequences[0].shape[1], n_init=1).fit(strided_data)
        # The fitted timescales are in units of the strided frame spacing,
        # so multiply by lag_time to convert back to original frames.
        timescales = hmm.timescales_ * lag_time
        hmm_ts0[n].append(timescales[0])
        hmm_ts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' % (n, lag_time, timescales))
    print()
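Beyond the implied timescales, the fitted HMM carries the parameters of its model. The snippet below is a sketch for inspecting the last HMM fit above; the attribute names (means_, populations_, transmat_) are assumptions based on Mixtape's GaussianFusionHMM API and may differ between versions:

# Inspect the most recently fit HMM (assumed attribute names; verify
# against your Mixtape/MSMBuilder version).
print(hmm.means_.shape)     # Gaussian emission means, one row per hidden state
print(hmm.populations_)     # equilibrium population of each hidden state
print(hmm.transmat_)        # hidden-state transition probability matrix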
figure(figsize=(14,3))
for i, n in enumerate(n_states):
    subplot(1,len(n_states),1+i)
    plot(lag_times, hmm_ts0[n])
    plot(lag_times, hmm_ts1[n])
    if i == 0:
        ylabel('Relaxation Timescale')
    xlabel('Lag Time')
    title('%d states' % n)
show()
msmts0, msmts1 = {}, {}
lag_times = [1, 10, 20, 30, 40]
n_states = [4, 8, 16, 32, 64]
for n in n_states:
    msmts0[n] = []
    msmts1[n] = []
    for lag_time in lag_times:
        # Cluster the featurized data into n microstates, then estimate an
        # MSM from the resulting state assignments at this lag time.
        assignments = KCenters(n_clusters=n).fit_predict(sequences)
        msm = MarkovStateModel(lag_time=lag_time, verbose=False).fit(assignments)
        timescales = msm.timescales_
        msmts0[n].append(timescales[0])
        msmts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' % (n, lag_time, timescales[0:2]))
    print()
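For reference, the implied timescales reported by MarkovStateModel come from the eigenvalues $\lambda_i$ of the estimated transition matrix via $\tau_i = -\tau_{\mathrm{lag}} / \ln \lambda_i$. The check below is a sketch of that relation for a well-behaved reversible model, assuming msm (the last model fit above) exposes its transition matrix as transmat_ and its lag time as lag_time, as in msmbuilder's MarkovStateModel:

import numpy as np

# Recompute implied timescales from the transition-matrix eigenvalues and
# compare with the values reported by the last `msm` fit above.
eigenvalues = np.sort(np.linalg.eigvals(msm.transmat_).real)[::-1]
implied = -msm.lag_time / np.log(eigenvalues[1:])  # skip the stationary eigenvalue (1.0)
print(implied[:2])
print(msm.timescales_[:2])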
figure(figsize=(14,3))
for i, n in enumerate(n_states):
    subplot(1,len(n_states),1+i)
    plot(lag_times, msmts0[n])
    plot(lag_times, msmts1[n])
    if i == 0:
        ylabel('Relaxation Timescale')
    xlabel('Lag Time')
    title('%d states' % n)
show()
(hmm-and-msm.ipynb; hmm-and-msm_evaluated.ipynb; hmm-and-msm.py)