# Requires Python >= 3.11. Install from a shell (these are not Python statements):
#   pip install aaanalysis          # core
#   pip install 'aaanalysis[pro]'   # SHAP, FIMO, Bio
import numpy as np
import matplotlib.pyplot as plt
import aaanalysis as aa

# Load the example dataset, its labels, and the amino acid scales.
df_seq = aa.load_dataset(name='DOM_GSEC')  # γ-secretase
labels = df_seq['label'].to_list()
df_scales = aa.load_scales()
| TMD | Target Middle Domain — the central segment of interest (e.g. transmembrane domain); variable length. |
| JMD | Juxta Middle Domain — the fixed-width flanks adjoining the TMD (jmd_n on the N-side, jmd_c on the C-side). |
# Minimal split configuration: "Segment" splits only, with n_split_max=1.
# Relies on `sf` (aa.SequenceFeature) and `df_parts` created elsewhere.
split_kws = sf.get_split_kws(split_types="Segment", n_split_max=1)
cpp = aa.CPP(
    df_parts=df_parts,
    split_kws=split_kws,
)
# Extended split configuration: all three split types, n_split_max=5, and
# step sizes 3 and 4 for both Pattern and PeriodicPattern splits.
# Relies on `sf` (aa.SequenceFeature) and `df_parts` created elsewhere.
split_kws = sf.get_split_kws(
    split_types=["Segment", "Pattern", "PeriodicPattern"],
    n_split_max=5,
    steps_pattern=[3, 4],
    steps_periodicpattern=[3, 4],
)
cpp = aa.CPP(
    df_parts=df_parts,
    split_kws=split_kws,
)
import numpy as np
import matplotlib.pyplot as plt
import aaanalysis as aa

# Load the example dataset, its labels, and the amino acid scales.
df_seq = aa.load_dataset(name='DOM_GSEC')  # γ-secretase
labels = list(df_seq['label'])
df_scales = aa.load_scales()

# Extract the sequence parts (TMD plus both JMD flanks) used for the logo.
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(df_seq=df_seq, list_parts=['tmd', 'jmd_n', 'jmd_c'])

aa.plot_settings(font_scale=0.7)
# aal_kws builds df_logo + bits bar for you
aa.AAlogoPlot().single_logo(
    aal_kws=dict(df_parts=df_parts, labels=labels, label_test=1, tmd_len=20),
    name_data='Test set: substrates',
)
plt.tight_layout()
plt.show()

# default parts + a redundancy-reduced set of 100 scales
# Relies on `sf`, `df_seq`, `df_scales` and `labels` created elsewhere.
df_parts = sf.get_df_parts(df_seq=df_seq)
df_scales = aa.AAclust().select_scales(df_scales=df_scales, n_clusters=100)
cpp = aa.CPP(df_parts=df_parts, df_scales=df_scales)
df_feat = cpp.run(labels=labels, n_filter=100)

# Build the feature matrix and add tree-based feature importance to df_feat.
X = sf.feature_matrix(df_feat['feature'], df_parts)
tm = aa.TreeModel()
tm.fit(X, labels=labels)
df_feat = tm.add_feat_importance(df_feat=df_feat, sort=True)

cpp_plot = aa.CPPPlot()
aa.plot_settings()
# distribution of the top feature (feat_rank=1 of the sorted df_feat)
cpp_plot.feature(
    feature=df_feat,
    feat_rank=1,
    df_seq=df_seq,
    labels=labels,
    name_test='substrates',
    name_ref='non-subs.',
)
plt.tight_layout()
plt.show()
# same df_feat — rank the top discriminative features
# Relies on `cpp_plot` and `df_feat` created elsewhere.
aa.plot_settings(font_scale=0.6)
cpp_plot.ranking(
    df_feat=df_feat,
    n_top=15,
    rank=True,
    name_test='substrates',
    name_ref='non-subs.',
)
plt.tight_layout()
plt.show()


# global Part × Split × Scale map — all AAontology scales
# Relies on `cpp`, `df_feat` and `labels` created elsewhere.
cpp_plot = aa.CPPPlot()
aa.plot_settings(font_scale=0.65)
cpp_plot.feature_map(df_feat=df_feat)

# CPP.simplify → fewer, interpretable correlated scales
df_feat = cpp.simplify(df_feat=df_feat, labels=labels)
cpp_plot.feature_map(df_feat=df_feat)
plt.tight_layout()
plt.show()


# per-sample SHAP — APP's soft label (0.6), keyed by entry
# Relies on `X`, `labels`, `df_seq`, `df_feat`, `sf` and `cpp_plot` created elsewhere.
sm = aa.ShapModel()
sm.fit(X, labels=labels, df_seq=df_seq, fuzzy_labels={'P05067': 0.6})
df_feat = sm.add_feat_impact(
    df_feat=df_feat,
    df_seq=df_seq,
    samples='P05067',
    names='APP',
)

# Shared keyword arguments for both sample-level plots below.
args_seq = sf.get_args_seq(df_seq=df_seq, sample='P05067')
ka = dict(col_imp='feat_impact_APP', shap_plot=True, **args_seq)
cpp_plot.profile(df_feat=df_feat, **ka)
# vmin/vmax=±21% → same colour scale as the global feature map (comparable)
cpp_plot.feature_map(df_feat=df_feat, name_test='APP', vmin=-21, vmax=21, **ka)
plt.tight_layout()
plt.show()


# Relies on `df_scales`, `df_seq` and `X` created elsewhere.
aac = aa.AAclust()
# pick a redundancy-reduced set of scales
aac.select_scales(df_scales, n_clusters=10)
aac.medoid_names_  # 10 reduced scales (labels_ also set)

aac_plot = aa.AAclustPlot()
aac_plot.centers(df_scales=df_scales, labels=aac.labels_)
plt.tight_layout()
plt.show()

# AAclust also reduces redundant proteins (not just scales)
df_seq = aac.select_proteins(df_seq=df_seq, X=X)
# DOM_GSEC ships 1/0 — treat 0 as the unlabeled pool (label_unl=0)
# Relies on `X` and `labels` created elsewhere.
dpul = aa.dPULearn()
dpul.fit(X=X, labels=labels, label_unl=0, n_neg=31)  # n_neg: reliable negatives to mine
df_pu = dpul.df_pu_  # out: 1 pos · 0 rel-neg · 2 unl

dpul_plot = aa.dPULearnPlot()
dpul_plot.pca(df_pu=df_pu, labels=dpul.labels_)
plt.tight_layout()
plt.show()


# Reference windows around sites when you lack negatives:
# Relies on `df_seq` created elsewhere.
aaws = aa.AAWindowSampler()

# SAME proteins · window 9 (odd) -> PTM / single-residue site
df_same = aaws.sample_same_protein(df_seq, n=100, window_size=9)

# DIFFERENT proteins · window 10 (even) -> cleavage bond
df_diff = aaws.sample_different_protein(df_seq, n=100, window_size=10)

# SYNTHETIC — AA-frequency priors (null background)
df_syn = aaws.sample_synthetic(df_seq, n=100, generator='global_freq')
| df_seq | entry · sequence · label · tmd_start · tmd_stop |
| df_parts | one column per part: tmd · jmd_n · jmd_c · … |
| df_feat | feature · category · subcategory · scale_name · abs_auc · mean_dif · p_val · positions |
| X | feature matrix (samples × features) from sf.feature_matrix |
| dict_num | {entry: ndarray (L×D)} — numerical per-residue values |
| class | abbr | plot class | kind |
|---|---|---|---|
| SequencePreprocessor | sp | — | |
| EmbeddingPreprocessor | ep | — | |
| StructurePreprocessor (pro) | stp | — | |
| AnnotationPreprocessor (pro) | ap | — | |
| AAlogo | aal | AAlogoPlot | |
| AAWindowSampler | aaws | — | |
| SequenceFeature | sf | — | |
| NumericalFeature | nf | — | |
| AAclust | aac | AAclustPlot | Wrapper |
| CPP | cpp | CPPPlot | |
| dPULearn | dpul | dPULearnPlot | Wrapper |
| TreeModel | tm | — | Wrapper |
| ShapModel (pro) | sm | — | Wrapper |
| AAMut | aamut | AAMutPlot | |
| SeqMut | seqmut | SeqMutPlot | |
# Global settings applied through the aa.options mapping.
aa.options['random_state'] = 42
aa.options['verbose'] = True
aa.options['n_jobs'] = -1  # all cores (None = auto)
aa.options['allow_multiprocessing'] = True

# TMD model — JMD flank widths
aa.options['jmd_n_len'] = 10
aa.options['jmd_c_len'] = 10

# plot labels & system-level scales
aa.options['name_tmd'] = 'P5-P5′'  # e.g. cleavage site
# NOTE(review): `my_scales` is not defined in this snippet — presumably a
# user-supplied scales DataFrame; confirm before running.
aa.options['df_scales'] = my_scales