#!/usr/bin/env python
###################################################################
## Primary Author: Rohit Singh rsingh@alum.mit.edu
## Co-Authors: Ashwin Narayan, Brian Hie {ashwinn,brianhie}@mit.edu
## License: MIT
## Repository: http://github.io/rs239/schema
###################################################################
import pandas as pd
import numpy as np
import scipy, sklearn
import os, sys, string, fileinput, glob, re, math, itertools, functools, copy, logging, warnings
import sklearn.decomposition, sklearn.preprocessing
import cvxopt
#### local directory imports ####
# Temporarily prepend this file's directory to sys.path so that
# schema_base_config (which defines the schema_info/schema_debug/schema_error
# logging helpers used below) can be imported regardless of the caller's
# working directory; the original path is then restored so the import path
# is not permanently polluted.
oldpath = copy.copy(sys.path)
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from schema_base_config import *
sys.path = copy.copy(oldpath)
####
class SchemaQP:
    """Schema is a tool for integrating simultaneously-assayed data modalities.

    The SchemaQP class provides a sklearn-style fit+transform API for
    constrained affine transformations of input datasets such that the
    transformed data is in agreement with all the input datasets.
    """

    def __init__(self, min_desired_corr=0.99, mode="affine", params=None):
        """
        :param min_desired_corr:
            Controls the severity of the primary modality's transformation by
            specifying the minimum required correlation between distances in
            the original space and those in the transformed space. It thus
            controls the trade-off between deviating further away from the
            primary modality's original representation and achieving greater
            agreement with the secondary modalities. Values close to one
            result in lower distortion of the primary modality; values close
            to zero enable transformations offering greater agreement between
            the modalities.

            RECOMMENDED VALUES: In typical single-cell use cases, high values
            (> 0.80) will probably work best. With these, the distortion will
            be low, but still enough for Schema to extract relevant
            information from the secondary modalities, and the feature weights
            computed by Schema should still be quite informative.
            The default of 0.99 is a safe starting choice; later you can
            experiment with a range of values (e.g., 0.95, 0.90, 0.80), use
            feature-weights aggregated across an ensemble of choices, or use
            cross-validation to identify the best setting.
        :type min_desired_corr: float in [0,1)

        :param mode:
            Whether to perform a general affine transformation or just a
            scaling transformation.

            * `affine` first maps to PCA or NMF space (see `num_top_components`
              in `params`), does a scaling transform there, and converts back
              to the regular space; the final result is thus an affine
              transformation of the regular space.
            * `scale` skips the PCA/NMF mapping and directly applies the
              scaling transformation. **Note**: this can be slow if the
              primary modality's dimensionality is over 100.

            RECOMMENDED VALUES: `affine` is the default. You may need `scale`
            only if (a) you have a limited number of features on which you
            directly want Schema to compute feature-weights, or (b) you want a
            change-of-basis transform other than PCA or NMF, in which case do
            that transform yourself and call SchemaQP with mode='scale'.
        :type mode: string

        :param params:
            Dictionary of key-value pairs for additional configuration.
            The important ones:

            * `decomposition_model`: "pca" or "nmf" (default="nmf")
            * `num_top_components`: (default=20) number of PCA (or NMF)
              components to use when mode=="affine". We recommend <= 100;
              Schema's runtime is quadratic in this number.

            Rarely-needed settings (defaults are reasonable):

            * `dist_npairs`: (default=2000000) number of point-pairs used for
              computing pairwise distances. None means compute exhaustively
              over all n*(n-1)/2 pairs (not recommended for n>5000); otherwise
              pairs are sampled so each point is represented roughly equally.
            * `scale_mode_uses_standard_scaler`: 1 or 0 (default=0), apply the
              standard scaler in the scaling mode.
            * `do_whiten`: 1 or 0 (default=1). When mode=="affine", should the
              change-of-basis loadings be made 1-variance?
        :type params: dict

        :returns: A SchemaQP object on which you can call fit(...),
            transform(...) or fit_transform(....).
        :raises ValueError: for an invalid mode, min_desired_corr, or params.
        """
        if not (mode in ['scale', 'affine']): raise ValueError("'mode' must be one of ['affine','scale']")
        if not (0 <= min_desired_corr < 1): raise ValueError("'min_desired_corr' must be in the range [0,1)")

        self._mode = mode
        self._w_max_to_avg = 1000  # loose cap on max/avg weight ratio; see reset_maxwt_param
        schema_info ("Flag 456.10 ", self._w_max_to_avg)
        self._min_desired_corr = min_desired_corr
        self._std_scaler = None      # set during fit when standard scaling is used
        self._decomp_mdl = None      # sklearn PCA/NMF model fit on the primary dataset
        self._orig_decomp_mdl = None

        # defaults (overridable via `params`)
        self._params = {"decomposition_model": "nmf",
                        "num_top_components": 20,
                        "dist_npairs": 2000000,
                        "d0_dist_transform": None,
                        "secondary_data_dist_transform_list": None,
                        "d0_orig_transformed_corr": None,
                        "dist_df_frac": 1.0,
                        "scale_mode_uses_standard_scaler": 0,
                        "do_whiten": 1,
                        "require_nonzero_lambda": 0,
                        "d0_type_is_feature_vector_categorical": 0,
                        }

        # `params=None` default (instead of a shared mutable `{}`) avoids the
        # mutable-default-argument pitfall; user settings overwrite defaults.
        if params:
            self._params.update(params)

        if not (self._params["decomposition_model"].lower() in ["pca","nmf"]):
            raise ValueError("For change-of-basis transforms other than PCA/NMF, please do the transformation yourself and use SchemaQP with mode='scale'")
        if not (self._params['num_top_components'] is None or self._params["num_top_components"] >= 2): raise ValueError('Need num_top_components >= 2')
def reset_mincorr_param(self, min_desired_corr):
    """Reset the min_desired_corr.

    Useful when you want to iterate over multiple choices of this parameter
    but want to re-use the computed PCA or NMF change-of-basis transform.

    :param min_desired_corr:
        The new value of minimum required correlation between original and
        transformed distances.
    :type min_desired_corr: float in [0,1)
    :raises ValueError: if min_desired_corr is None or outside [0,1).
    """
    if min_desired_corr is None or not(0 <= min_desired_corr < 1): raise ValueError("'min_desired_corr' must be between 0 and 1")
    self._min_desired_corr = min_desired_corr
def reset_maxwt_param(self, w_max_to_avg):
    """Reset the w_max_to_avg param.

    :param w_max_to_avg:
        The upper-bound on the ratio of the largest element of the Schema
        weight vector (w) to w's average element. This parameter controls the
        'deviation' in feature weights; making it large allows for more
        severe transformations.

        **Handle with care:** We recommend keeping this parameter at its
        default value (1000); that keeps this constraint very loose and
        ensures that min_desired_corr remains the binding constraint.
        Later, as you get a better sense for the right min_desired_corr
        values for your data, you can experiment with this too. To really
        constrain this, set it in the (1-5] range, depending on how many
        features you have.
    :type w_max_to_avg: float, > 1
    :raises ValueError: if w_max_to_avg is None or <= 1.
    """
    # NOTE: None is rejected here (the original error message misleadingly
    # suggested None was allowed; the check has always raised on None).
    if w_max_to_avg is None or w_max_to_avg <= 1: raise ValueError("'w_max_to_avg' must be greater than 1")
    self._w_max_to_avg = w_max_to_avg
def fit(self, d, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list = None, secondary_data_dist_kernels=None, d0 = None, d0_dist_transform=None):
    """Compute the optimal Schema transformation, first performing a
    change-of-basis transformation if required.

    Given the primary dataset `d` and a list of secondary datasets, fit a
    linear transformation (`d_new`) of `d` such that the correlation between
    squared pairwise distances in `d_new` and those in the secondary datasets
    is maximized while the correlation between the original `d` and the
    transformed `d_new` remains above min_desired_corr.

    The first three arguments are required, the next is useful, and the rest
    should be rarely used.

    :type d: Numpy 2-d `array` or Pandas `dataframe`
    :param d:
        The primary dataset (e.g. scanpy/anndata's .X). Rows are observations
        (e.g., cells) and columns are variables (e.g., gene expression). The
        default distance measure computed is L2: sum((point1-point2)**2).
        Also see `d0_dist_transform`.
    :type secondary_data_val_list: list of 1-d or 2-d Numpy arrays or Pandas series, each with same number of rows as `d`
    :param secondary_data_val_list:
        The secondary datasets you want to align the primary data towards.
        Columns in Anndata .obs or .obsm variables work well.
    :type secondary_data_type_list: list of strings
    :param secondary_data_type_list:
        The datatypes of the secondary modalities; must match the length of
        secondary_data_val_list. Each element is one of:

        * `numeric`: one floating-pt value per observation; default distance
          is (point1-point2)^2. The array must be 1-d.
        * `feature_vector`: a k-dimensional vector per observation; default
          distance is sum_{i}((point1[i]-point2[i])^2). Must be 2-d.
        * `categorical`: a label per observation; default distance is
          equality-check: 1*(val1!=val2). Must be 1-d.
        * `feature_vector_categorical`: a vector of labels per observation;
          distance is sum_{i}(point1[i]==point2[i]). Must be 2-d.
    :type secondary_data_wt_list: list of floats, **optional**
    :param secondary_data_wt_list:
        User-specified wts for each secondary dataset (default= list of 1's).
        If specified, must match the length of secondary_data_val_list.
        **Note**: to get a mapping that *disagrees* with a dataset instead of
        agreeing, pass a negative number (e.g., -1) for it.
    :type secondary_data_dist_kernels: list of functions, **optional**
    :param secondary_data_dist_kernels:
        Transformations to apply on each secondary dataset's L2 distances
        before using them for correlations. Each function should map a
        non-negative float to a non-negative float.
        **Handle with care:** most likely, you don't need this parameter.
    :type d0: A 1-d or 2-d Numpy array, **optional**
    :param d0:
        An alternative representation of the primary dataset, used only for
        computing the pairwise distances in the QP constraint; `d` is still
        the dataset that gets transformed. Must have the same number of rows
        as `d`. **Handle with care:** most likely, you don't need this.
    :type d0_dist_transform: float -> float function, **optional**
    :param d0_dist_transform:
        Transformation to apply on d's (or d0's) L2 distances before using
        them for correlations. Should map a non-negative float to a
        non-negative float. **Handle with care:** rarely needed.
    :returns: None
    :raises ValueError: on shape/type mismatches or NaN/inf values.
    """
    # ---- argument validation -------------------------------------------
    if not (d.ndim==2): raise ValueError('d should be a 2-d array')
    if not (len(secondary_data_val_list) >0): raise ValueError('secondary_data_val_list can not be empty')
    if not (len(secondary_data_val_list)==len(secondary_data_type_list)):
        raise ValueError('secondary_data_type_list should have the same length as secondary_data_val_list')
    if not (secondary_data_wt_list is None or len(secondary_data_wt_list)==len(secondary_data_val_list)):
        raise ValueError('secondary_data_wt_list should have the same length as secondary_data_val_list')

    for i in range(len(secondary_data_val_list)):
        if not (secondary_data_type_list[i] in ['categorical','numeric','feature_vector', 'feature_vector_categorical']):
            raise ValueError('{0}-th entry in secondary_data_type_list is invalid'.format(i+1))
        if not (secondary_data_val_list[i].shape[0] == d.shape[0]):
            raise ValueError('{0}-th entry in secondary_data_val_list has incorrect rows'.format(i+1))
        # dimensionality must match the declared type: 1-d for scalar/categorical values, 2-d for vectors
        if not ((secondary_data_type_list[i]=='categorical' and secondary_data_val_list[i].ndim==1) or
                (secondary_data_type_list[i]=='numeric' and secondary_data_val_list[i].ndim==1) or
                (secondary_data_type_list[i]=='feature_vector' and secondary_data_val_list[i].ndim==2) or
                (secondary_data_type_list[i]=='feature_vector_categorical' and secondary_data_val_list[i].ndim==2)):
            raise ValueError('{0}-th entry in secondary_data_val_list does not match specified type'.format(i+1))

    if not (d0 is None or d0.shape[0] == d.shape[0]): raise ValueError('d0 has incorrect rows')

    self._params["d0_dist_transform"] = d0_dist_transform
    self._params["secondary_data_dist_transform_list"] = secondary_data_dist_kernels

    ## Before calling the worker functions, convert to np arrays and check for NaNs
    def fconv_to_np(x):
        # pandas objects -> underlying numpy array; lists -> np.array; else unchanged
        if isinstance(x, pd.DataFrame) or isinstance(x, pd.Series): return x.values
        if isinstance(x, list): return np.array(x)
        return x

    def any_nans(npobj):
        # only float arrays can hold NaN/inf; a single sum detects either
        if npobj.dtype.kind != 'f': return False
        v = np.sum(npobj)
        return np.isnan(v) or np.isinf(v)

    t_d = fconv_to_np(d)
    t_secondary_data_val_list = [ fconv_to_np(v) for v in secondary_data_val_list]
    t_d0 = fconv_to_np(d0)

    if any_nans(t_d): raise ValueError('d should not have any NaN or inf values')
    if t_d0 is not None and any_nans(t_d0): raise ValueError('d0 should not have any NaN or inf values')
    for i, secd in enumerate(t_secondary_data_val_list):
        if any_nans(secd): raise ValueError('{}-th entry in secondary_data_val_list has NaN or inf values'.format(i+1))

    np.random.seed(0) # try to get repeated runs to look as similar as possible

    if self._mode=="scale":
        self._fit_scale(t_d, t_d0, t_secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
    else: #affine
        self._fit_affine(t_d, t_d0, t_secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
def feature_weights(self, affine_map_style='top-k-loading', k=1):
    """
    Return the feature weights computed by Schema.

    If SchemaQP was initialized with `mode=scale`, the weights returned are
    directly the weights from the quadratic programming (QP), with a weight
    > 1 indicating the feature was up-weighted; `affine_map_style` is ignored.

    If `mode=affine` was used, the QP-computed weights correspond to columns
    of the PCA or NMF decomposition, and this function maps them back to the
    primary dataset's features in one of three ways (see `affine_map_style`).
    You can also build your own mapping: the instance's `_wts` member holds
    the QP-computed weights and `_decomp_mdl` the sklearn NMF/PCA model.

    :type affine_map_style: string, one of 'softmax-avg' or 'top-k-rank' or 'top-k-loading', default='top-k-loading'
    :param affine_map_style:
        * 'top-k-loading' (default): consider only the top-k PCA/NMF columns
          by QP-computed weight and compute each gene's average loading
          across them; `k` specifies k (default=1).
        * 'softmax-avg': gene weights are a softmax-type summation of
          loadings across the PCA/NMF columns, each column weighted
          proportionally to exp(QP wt), considering only columns with QP
          weight > 1. `k` is ignored.
        * 'top-k-rank': consider only the top-k PCA/NMF columns by
          QP-computed weight and compute each gene's average percentile rank
          across their loadings; `k` specifies k (default=1).

        In all approaches, PCA loadings are first converted to absolute
        value, since PCA columns are unique up to a sign.
    :type k: int, >= 1 and < number of QP weights
    :param k:
        The number of PCA/NMF columns to average over, when
        affine_map_style = top-k-loading or top-k-rank.
    :returns: a vector of floats, the same size as the primary dataset's
        dimensionality
    :raises ValueError: for an unknown style or invalid k.
    """
    if self._mode == "scale":
        return self._wts

    # PCA loadings are unique up to a sign, so aggregate their magnitudes;
    # NMF loadings are non-negative and used as-is.
    use_abs = self._params['decomposition_model'] != 'nmf'

    def _loadings(rank_pct=False):
        # one-row-per-component loading matrix, optionally as percentile ranks
        df = pd.DataFrame(self._decomp_mdl.components_)
        if use_abs:
            df = df.abs()
        if rank_pct:
            df = df.rank(axis=1, pct=True)
        return df

    if affine_map_style == 'softmax-avg':
        w = np.exp(self._wts) # exponentiation of wts so the highest-wt features really stand out
        w[ self._wts <= 1] = 0 # ignore features with wt <=1 (these were not up-weighted)
        w = w/np.sum(w) # normalize
        df_comp = _loadings()
    elif affine_map_style in ("top-k-rank", "top-k-loading"):
        # explicit check (the original assert-in-try would be stripped under -O)
        try:
            k_ok = (k > 0) and (k < len(self._wts))
        except TypeError:
            k_ok = False
        if not k_ok:
            raise ValueError("""Incorrect "k" argument for '{}'""".format(affine_map_style))
        w = self._wts.copy()
        w[w.argsort()[:-k]] = 0 # set all but the top-k PCA/NMF column weights to zero
        w = w/np.sum(w) # normalize
        df_comp = _loadings(rank_pct=(affine_map_style == "top-k-rank"))
    else:
        raise ValueError(""" "style" needs to be one of 'top-k-loading' or 'softmax-avg' or 'top-k-rank'""")

    # weighted sum of per-component loadings -> per-feature weights
    feat_wts = (df_comp* w[:,None]).sum(axis=0).values
    return feat_wts
def get_start_end_dist_correlations(self):
    """Return the starting and ending distance correlations between primary and secondary modalities.

    Note: the distance correlations reported (even between the primary and
    secondary modalities) may vary from run to run, since the underlying
    algorithm samples a set of point pairs to compute its estimates.

    :returns: a tuple with 3 entries (each None if unavailable):
        a) distance correlation between primary and transformed space; always
           >= min_desired_corr, possibly substantially greater if the optimal
           solution requires it.
        b) vector of distance correlations between primary and secondary
           modalities, and
        c) vector of distance correlations between transformed dataset and
           secondary modalities.
    """
    # read straight from the stored solution info; no intermediate dict needed
    si = self._soln_info
    return (si.get("distcorr", None),
            si.get("inp_sec_modality_corrs", None),
            si.get("out_sec_modality_corrs", None))
def explore_param_mincorr(self,
                          d, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list = None,
                          min_desired_corr_values = [0.999, 0.99, 0.95, 0.9, 0.8, 0.5, 0.2, 0],
                          addl_fit_kwargs = None,
                          addl_feature_weights_kwargs = None):
    """Helper function to explore multiple choices of the min_desired_corr param.

    For a range of min_desired_corr values, it performs a `fit`, gets the
    `feature_weights`, and the achieved distance correlation between the
    transformed data and the primary/secondary modalities. While this method
    is simply a convenience wrapper around other public methods, it is useful
    for exploring the best choice of min_desired_corr for your application.
    For example, if you're doing batch correction (a secondary modality with
    negative wt), you want the distance correlation of batch information and
    transformed data to go to zero — not beyond, into negative-correlation
    territory; this function can help identify the right min_desired_corr.

    The required arguments are the same as those for `fit` (which this method
    calls under the hood). The default list of min_desired_corr candidates is
    a good place to start.

    :type d: Numpy 2-d `array` or Pandas `dataframe`
    :param d: same as in `fit`: the primary dataset (e.g. scanpy/anndata's .X).
    :type secondary_data_val_list: list of 1-d or 2-d Numpy arrays or Pandas series, each with same number of rows as `d`
    :param secondary_data_val_list: same as in `fit`.
    :type secondary_data_type_list: list of strings
    :param secondary_data_type_list: same as in `fit`.
    :type secondary_data_wt_list: list of floats, **optional**
    :param secondary_data_wt_list: same as in `fit` (default= list of 1's).
    :type min_desired_corr_values: list of floats, each value v being 0 <= v < 1
    :param min_desired_corr_values: list of min_desired_corr values to explore.
    :type addl_fit_kwargs: dict
    :param addl_fit_kwargs: additional named arguments passed on to fit(...)
    :type addl_feature_weights_kwargs: dict
    :param addl_feature_weights_kwargs: named arguments passed on to feature_weights(...)
    :returns: a tuple with 4 entries (in the first 3, each dataframe row
        corresponds to one min_desired_corr value):
        a) Dataframe of starting/ending distance correlations
           (see `get_start_end_dist_correlations`),
        b) Dataframe of feature weights (via `feature_weights`),
        c) Dataframe of QP solution wts (same as feature weights if
           mode='scale'; otherwise the QP-computed wts in PCA/NMF space),
        d) Dictionary of SchemaQP objects keyed by the min_desired_corr
           parameter, usable for `transform` calls.
    :raises ValueError: if min_desired_corr_values is not a non-empty list of
        floats in [0,1).
    """
    # explicit validation (the original assert-in-try would be stripped under -O)
    ok = isinstance(min_desired_corr_values, list) and len(min_desired_corr_values) > 0
    if ok:
        try:
            ok = all(0 <= a < 1 for a in min_desired_corr_values)
        except TypeError:
            ok = False
    if not ok:
        raise ValueError( 'mincorr_values should be a non-empty list of floats v, with 0 <= v < 1')

    # None defaults avoid the shared-mutable-default-argument pitfall
    addl_fit_kwargs = {} if addl_fit_kwargs is None else addl_fit_kwargs
    addl_feature_weights_kwargs = {} if addl_feature_weights_kwargs is None else addl_feature_weights_kwargs

    mcvals = list(reversed(sorted(min_desired_corr_values)))  # strictest (highest) first
    ret_sqp, ret_sqpwts, ret_featwts, ret_distcorrs = {}, [], [], []

    prev_sqp = self
    for i, mc in enumerate(mcvals):
        print("Evaluating min_desired_corr: {}".format(mc))
        # deep-copy the previous object so the PCA/NMF change-of-basis is re-used
        sqp1 = copy.deepcopy(prev_sqp)
        sqp1.reset_mincorr_param(mc)
        sqp1.fit(d, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list, **addl_fit_kwargs)
        prev_sqp = sqp1

        ret_sqp[mc] = sqp1
        ret_sqpwts.append( [mc] + list(sqp1._wts))
        ret_featwts.append( [mc] + list(sqp1.feature_weights(**addl_feature_weights_kwargs)))
        prim_distcorr, inp_sec_distcorrs, out_sec_distcorrs = sqp1.get_start_end_dist_correlations()
        ret_distcorrs.append([mc, prim_distcorr] + inp_sec_distcorrs + out_sec_distcorrs)

    df_sqpwts = pd.DataFrame(ret_sqpwts,
                             columns = ['min_corr'] + ['QPdim_{}'.format(i+1) for i in range(len(ret_sqpwts[0])-1)])
    df_featwts = pd.DataFrame(ret_featwts,
                              columns = ['min_corr'] + ['PrimFeat_{}'.format(i+1) for i in range(len(ret_featwts[0])-1)])
    # BUGFIX: count modalities from secondary_data_val_list (required arg),
    # not secondary_data_wt_list, which is None by default and would crash here.
    n_sec = len(secondary_data_val_list)
    df_distcorrs = pd.DataFrame(ret_distcorrs,
                                columns = (['min_corr', 'prim_vs_output_distcorr'] +
                                           ['prim_vs_sec{}_distcorr'.format(i+1) for i in range(n_sec)] +
                                           ['output_vs_sec{}_distcorr'.format(i+1) for i in range(n_sec)]))
    return df_distcorrs, df_featwts, df_sqpwts, ret_sqp
###################################################################
######## "private" methods below. Not that Python cares... ########
###################################################################
def _getDistances(self, D, d0_in, G, nPointPairs):
    """
    Compute the various distances between point pairs in D.

    D is a NxK Numpy type 2-d array, with N points, each of K dimensions.
    d0_in could be None, in which case D is used to compute distances for the
    original space; otherwise d0_in is a Nxr (r>=1) array and original-space
    distances are computed from it.
    G is a list, with each entry a 3-tuple: (g_val, g_type, gamma_g)
       - each 3-tuple corresponds to one dimension of side-information
       - g_val is a Nx1 Numpy type 1-d vector of values for the N points, in
         the same order as D
       - g_type is "numeric", "categorical", or "feature_vector"
       - gamma_g is the relative wt to give this column. Leave it as None for
         all 3-tuples, or set it for all of them (mixing is an error); if all
         are None the system sets wts to 1.
    nPointPairs is the number of point pairs to evaluate over. If None, all
    possible pairs are generated.

    Returns (uv_dist_centered, z_d0, l_z_dg); see symbology below.
    """
    ##############################
    # symbology
    #  uv_dist = nPointPairs x K matrix of squared distances between point pairs, along K dimensions
    #  uv_dist_mean = 1 x K vector: avg of uv_dist along each dimension
    #  uv_dist_centered = nPointPairs x K matrix with uv_dist_mean subtracted from each row
    #  d0 = nPointPairs x 1 vector: squared distances in the current space
    #  z_d0 = z-scored version of d0
    #  dg = nPointPairs x 1 vector: squared distances (or class-match scores) for grouping g
    #  z_dg = z-scored version of dg
    ##############################
    schema_info ("Flag 232.12 ", len(G), G[0][0].shape, G[0][1], G[0][2])

    # gamma_g's must be all-None or all-set; mixing the two is rejected
    gamma_gs_are_None = True
    for i,g in enumerate(G):
        g_val, g_type, gamma_g = g
        assert g_type in ['categorical','numeric','feature_vector']
        if i==0 and gamma_g is not None:
            gamma_gs_are_None = False
        if gamma_g is None: assert gamma_gs_are_None
        if gamma_g is not None: assert (not gamma_gs_are_None)

    N, K = D.shape[0], D.shape[1]

    if nPointPairs is not None:
        # oversample pairs 3x, then filter; u<v removes self-pairs and order-duplicates
        j_u = np.random.randint(0, N, int(3*nPointPairs))
        j_v = np.random.randint(0, N, int(3*nPointPairs))
        valid = j_u < j_v #get rid of potential duplicates (x1,x2) and (x2,x1) as well as (x1,x1)
        i_u = (j_u[valid])[:nPointPairs]
        i_v = (j_v[valid])[:nPointPairs]
    else:
        # exhaustive: all n*(n-1)/2 unordered pairs
        x = pd.DataFrame.from_records(list(itertools.combinations(range(N),2)), columns=["u","v"])
        x = x[x.u < x.v]
        i_u = x.u.values
        i_v = x.v.values

    # NxK matrix of square distances along dimensions
    # NOTE(review): the .get() fallback here is 1, but __init__ defaults this
    # param to 0, so the fallback looks unreachable in practice — confirm intent.
    if int(self._params.get("d0_type_is_feature_vector_categorical",1)) > 0.5:
        # categorical feature-vectors: per-dimension equality scores instead of squared diffs
        uv_dist = (D[i_u,:] == D[i_v,:]).astype(np.float64)
    else:
        uv_dist = (D[i_u,:].astype(np.float64) - D[i_v,:].astype(np.float64))**2

    # scale things so QP gets ~1-3 digit numbers, very large or very small numbers cause numerical issues
    uv_dist = uv_dist/ (np.sqrt(uv_dist.shape[0])*uv_dist.ravel().mean())

    # center uv_dist along each dimension
    uv_dist_mean = uv_dist.mean(axis=0) #Kx1 vector, this is the mean of (u_i - v_i)^2, not sqrt((u_i - v_i)^2)
    uv_dist_centered = uv_dist - uv_dist_mean

    # square distances in the current metric space
    if d0_in is None:
        d0 = uv_dist.sum(axis=1) # Nx1
    else:
        dx0 = (d0_in[i_u] - d0_in[i_v])**2
        if dx0.ndim==2:
            d0 = dx0.sum(axis=1)
        else:
            d0 = dx0

    d0_orig = d0.copy()
    f_dist_d0 = self._params["d0_dist_transform"]
    # record how much the user's d0 transform distorts the raw distances;
    # 1.0 means "no transform applied"
    self._params["d0_orig_transformed_corr"] = 1.0
    if f_dist_d0 is not None:
        d0 = f_dist_d0(d0)
        cr = (np.corrcoef(d0_orig, d0))[0,1]
        if cr<0:
            # it's an error if you specify a d0 such that corr( pairwisedist(d), pairwise(d0)) is negative to begin with
            schema_error("d0_dist_transform inverts the correlation structure {0}.".format(cr))
            raise ValueError("""d0_dist_transform inverts the correlation structure. Aborting...
It's an error if you specify a d0 such that corr( pairwisedist(d), pairwise(d0)) is negative to begin with""")
        self._params["d0_orig_transformed_corr"] = cr

    z_d0 = (d0 - d0.mean())/d0.std()
    schema_info("Flag 2090.20 Initial corr to d0", np.corrcoef(uv_dist_centered.sum(axis=1), z_d0))

    # z-score each secondary modality's pairwise distances (optionally
    # kernel-transformed and then scaled by its user-specified weight)
    l_z_dg = []
    for ii, gx in enumerate(G):
        g_val, g_type, gamma_g = gx

        if self._params["secondary_data_dist_transform_list"] is not None:
            f_dist_dg = self._params["secondary_data_dist_transform_list"][ii]
        else:
            f_dist_dg = None

        schema_info ("Flag 201.80 ", g_val.shape, g_type, gamma_g, f_dist_dg)
        if g_type == "categorical":
            # mismatch indicator distance
            dg = 1.0*( g_val[i_u] != g_val[i_v]) #1.0*( g_val[i_u].toarray() != g_val[i_v].toarray())
        elif g_type == "feature_vector":
            schema_info ("Flag 201.82 ", g_val[i_u].shape, g_val[i_v].shape)
            dg = np.ravel(np.sum(np.power(g_val[i_u].astype(np.float64) - g_val[i_v].astype(np.float64),2), axis=1))
        elif g_type == "feature_vector_categorical":
            schema_info ("Flag 201.84 ", g_val[i_u].shape, g_val[i_v].shape)
            # per-dimension equality count (a similarity-style score)
            dg = np.ravel(np.sum((g_val[i_u] == g_val[i_v]).astype(np.float64), axis=1))
        else: #numeric
            dg = (g_val[i_u].astype(np.float64) - g_val[i_v].astype(np.float64))**2 #(g_val[i_u].toarray() - g_val[i_v].toarray())**2

        if f_dist_dg is not None:
            dg = f_dist_dg(dg)

        z_dg = (dg - dg.mean())/dg.std()
        gwt = 1
        if gamma_g is not None:
            z_dg *= gamma_g  # fold the modality weight into the z-scored distances
            gwt = gamma_g
        l_z_dg.append((gwt, z_dg))

    schema_info ("Flag 201.99 ", uv_dist_centered.shape, z_d0.shape, len(l_z_dg), l_z_dg[0][1].shape)
    schema_info ("Flag 201.991 ", uv_dist_centered.mean(axis=0))
    schema_info ("Flag 201.992 ", uv_dist_centered.std(axis=0))
    schema_info ("Flag 201.993 ", z_d0[:10], z_d0.mean(), z_d0.std())
    schema_info ("Flag 201.994 ", l_z_dg[0][0], l_z_dg[0][1][:10], l_z_dg[0][1].mean(), l_z_dg[0][1].std())
    return (uv_dist_centered, z_d0, l_z_dg)
def _prepareQPterms(self, uv_dist_centered, z_d0, l_z_dg):
nPointPairs = uv_dist_centered.shape[0]
l_wq, l_q = [], []
for i, (gwt, z_dg) in enumerate(l_z_dg):
l_wq.append( np.sum(uv_dist_centered * z_dg[:,None], axis=0) )
l_q.append( np.sum(uv_dist_centered * z_dg[:,None]/gwt, axis=0) )
q1 = l_wq[0]
for v in l_wq[1:]:
q1 += v
P1 = np.matmul( uv_dist_centered.T, uv_dist_centered)
g1 = np.sum(uv_dist_centered * z_d0[:,None], axis=0)
h1 = np.sum(g1)
return (P1, q1, g1, h1, l_q, nPointPairs)
def _computeSolutionFeatures(self, w, P1, q1, g1, l_q, nPointPairs):
    # Given a candidate QP weight vector w, compute solution diagnostics:
    #  - distcorr: correlation proxy between original and transformed distances,
    #    normalized by d0_orig_transformed_corr so a user-supplied d0 transform
    #    does not change the meaning of min_desired_corr
    #  - objval: aggregate correlation proxy with the (weighted) secondary modalities
    #  - out/inp_sec_modality_corrs: per-modality correlation proxies after (w)
    #    and before (w = all-ones) the transformation
    K = len(w)
    # sd of transformed pairwise distances: sqrt(w' P1 w / nPointPairs); the
    # matmul result is a 1x1 array, hence the [0] indexing below
    newmetric_sd = np.sqrt( np.matmul(np.matmul(np.reshape(w,(1,K)), P1), np.reshape(w,(K,1)))[0] / nPointPairs)
    oldnew_corr = ((np.dot(w,g1)/nPointPairs)/newmetric_sd) / (self._params["d0_orig_transformed_corr"])
    groupcorr_score = (np.dot(w,q1)/nPointPairs)/newmetric_sd
    out_sec_modality_corrs = [ ((np.dot(w,qx)/nPointPairs)/newmetric_sd)[0] for qx in l_q]

    # baseline: the all-ones weighting recovers input-space statistics
    w_ones = np.ones_like(w)
    oldmetric_sd = np.sqrt( np.matmul(np.matmul(np.reshape(w_ones,(1,K)), P1), np.reshape(w_ones,(K,1)))[0] / nPointPairs)
    inp_sec_modality_corrs = [ ((np.dot(w_ones,qx)/nPointPairs)/oldmetric_sd)[0] for qx in l_q]

    schema_debug("Flag 485.30 ", newmetric_sd, oldnew_corr, groupcorr_score, out_sec_modality_corrs, inp_sec_modality_corrs)
    return {"w": w, "distcorr": oldnew_corr[0],
            "objval": groupcorr_score[0],
            "out_sec_modality_corrs": out_sec_modality_corrs,
            "inp_sec_modality_corrs": inp_sec_modality_corrs}
def _doQPSolve(self, P1, q1, g1, h1, l_q, nPointPairs, lambda1, alpha, beta):
    """Solve one instance of the quadratic program with cvxopt and return its
    solution diagnostics (see _computeSolutionFeatures).

    lambda1 is the L2-regularization strength on w, beta scales the quadratic
    term P1, and alpha scales the min-correlation constraint threshold h1.
    """
    #https://cvxopt.org/examples/tutorial/qp.html and https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf
    # the cvx example switches P & q to Q & p
    from cvxopt import matrix, solvers
    # NOTE(review): this silences cvxopt globally and unconditionally restores
    # show_progress to True afterwards (not to its prior value) — confirm intent.
    solvers.options["show_progress"] = False

    K = len(g1)
    I_K = np.diagflat(np.ones(K).astype(np.float64))
    # quadratic term: 2*(lambda1*I + beta*P1); the linear term pulls w towards
    # the secondary-modality projections plus the regularizer's preference for w=1
    #P = 2* (lambda1*matrix(I_K) + beta*matrix(P1))
    P = matrix(2* ((I_K*lambda1) + (P1*beta)))
    q = -matrix(q1 + 2*lambda1*np.ones(K))

    # inequality constraints G w <= h
    G0 = np.zeros((K+1,K)).astype(np.float64)
    for i in range(K):
        G0[i,i] = 1.0
    G0[-1,:] = g1
    G = -matrix(G0) #first K rows say -w_i <= 0. The K+1'th row says -corr(newdist,olddist) <= -const
    h = matrix(np.zeros(K+1))
    h[-1] = -alpha*h1

    schema_debug("Flag 543.70 ", P1.shape, q1.shape, g1.shape, h1, nPointPairs, lambda1 , alpha, beta, P.size, q.size, G0.shape, G.size, h.size) #K, P.size, q.size, G.size, h.size)

    sol=solvers.qp(P, q, G, h)
    solvers.options["show_progress"] = True
    w = np.array(sol['x']).ravel()

    s = self._computeSolutionFeatures(w, P1, q1, g1, l_q, nPointPairs)
    return s
def _summarizeSoln(self, soln, free_params):
retl = []
retl.append(("w_max_to_avg", (max(soln["w"])/np.nanmean(soln["w"]))))
retl.append(("w_num_zero_dims", sum(soln["w"] < 1e-5)))
retl.append(("d0_orig_transformed_corr", self._params["d0_orig_transformed_corr"]))
retl.append(("distcorr", soln["distcorr"]))
retl.append(("inp_sec_modality_corrs", soln["inp_sec_modality_corrs"]))
retl.append(("out_sec_modality_corrs", soln["out_sec_modality_corrs"]))
retl.append(("objval", soln["objval"]))
retl.append(("lambda", free_params["lambda"]))
retl.append(("alpha", free_params["alpha"]))
retl.append(("beta", free_params["beta"]))
return retl
def _doQPiterations(self, P1, q1, g1, h1, l_q, nPointPairs, max_w_wt, min_desired_oldnew_corr):
    """Outer hyperparameter sweep: try alpha in {1.0, 0.9, ..., ~0}.

    For each alpha, _iterateQPLevel1 sweeps beta and returns its best feasible
    solution. Among all candidates meeting min_desired_oldnew_corr, return the
    one with the highest objective value as (soln, param_settings); returns
    (None, {}) when no feasible solution was found.
    """
    solutionList = []
    schema_info('Progress bar (each dot is 10%): ', end='', flush=True)
    alpha = 1.0 #start from one (i.e. no limit on numerator and make it bigger)
    while alpha > 1e-5:
        soln, param_settings = self._iterateQPLevel1(P1, q1, g1, h1, l_q, nPointPairs, max_w_wt, min_desired_oldnew_corr, alpha)
        if soln is not None and soln["distcorr"] >= min_desired_oldnew_corr:
            # negate objval so an ascending sort puts the best score first
            solutionList.append((-soln["objval"], soln, param_settings))
        alpha -= 0.1
        schema_info('.', end='', flush=True)
    schema_info(' Done\n', end='', flush=True)
    # BUGFIX: the original wrapped the sort+index in a bare `except:` that was
    # only meant to catch IndexError on an empty list, but silently swallowed
    # every other error too. An explicit emptiness check is equivalent and safe:
    # the sort compares only the numeric key, so it cannot raise.
    if not solutionList:
        return (None, {})
    solutionList.sort(key=lambda v: v[0])  # smallest -objval == highest score
    return (solutionList[0][1], solutionList[0][2])
def _iterateQPLevel1(self, P1, q1, g1, h1, l_q, nPointPairs, max_w_wt, min_desired_oldnew_corr, alpha):
    """Middle hyperparameter sweep: halve beta from 1e6 down to 1e-6.

    For each beta, _iterateQPLevel2 bisects over the lambda regularizer. Solver
    failures at a particular beta are logged and skipped (best-effort sweep).
    Returns the best feasible (soln, param_settings), or (None, {}) if none.
    """
    solutionList = []
    beta = 1e6 #start from a large value and go towards zero (a large value means the denominator will need to be small)
    while beta > 1e-6:
        try:
            soln, param_settings = self._iterateQPLevel2(P1, q1, g1, h1, l_q, nPointPairs, max_w_wt, alpha, beta)
        except Exception as e:
            # deliberate best-effort: one failed beta shouldn't abort the sweep
            schema_info ("Flag 110.50 crashed in _iterateQPLevel2. Trying to continue...", P1.size, q1.size, g1.size, max_w_wt, alpha, beta)
            schema_info(e)
            beta *= 0.5
            continue
        if soln["distcorr"] >= min_desired_oldnew_corr:
            solutionList.append((-soln["objval"], soln, param_settings))
            schema_debug ("Flag 110.55 ", beta, len(solutionList), solutionList[0])
        beta *= 0.5
    # BUGFIX: replaced the bare `except:` (which only guarded IndexError on an
    # empty list but hid all other errors) with an explicit check. The debug
    # output on both paths matches the original behavior.
    if not solutionList:
        schema_debug ("Flag 110.60 beta: ", "NONE")
        return (None, {})
    solutionList.sort(key=lambda v: v[0]) #find the highest score
    schema_debug ("Flag 110.60 beta: ", self._summarizeSoln(solutionList[0][1], solutionList[0][2]))
    return (solutionList[0][1], solutionList[0][2])
def _iterateQPLevel2(self, P1, q1, g1, h1, l_q, nPointPairs, max_w_wt, alpha, beta):
    """Inner search: bisect (in log space) over the lambda regularizer.

    lambda is parameterized as 1/x with x in [lo, hi]; a large lambda (small x)
    pulls the weights toward uniformity, shrinking the max-to-avg weight spread.
    The bisection looks for the weakest regularization whose solution still
    keeps the spread (max(w)/mean(w)) within max_w_wt.

    Returns the solution dict from _doQPSolve plus the {"lambda","alpha","beta"}
    settings used.
    """
    lo=1e-10 #1e-6
    # strongest regularization considered (lambda = 1/lo): near-uniform weights
    solLo = self._doQPSolve(P1, q1, g1, h1, l_q, nPointPairs, 1/lo, alpha, beta)
    scalerangeLo = (max(solLo["w"])/np.nanmean(solLo["w"]))
    hi=1e9 #1e6
    # weakest regularization considered (lambda = 1/hi): weights nearly free
    solHi = self._doQPSolve(P1, q1, g1, h1, l_q, nPointPairs, 1/hi, alpha, beta)
    scalerangeHi = (max(solHi["w"])/np.nanmean(solHi["w"]))
    if scalerangeHi < max_w_wt:
        # spread constraint is inactive even with weakest regularization;
        # try lambda=0 outright unless the caller requires a nonzero lambda
        solZero = self._doQPSolve(P1, q1, g1, h1, l_q, nPointPairs, 0, alpha, beta)
        scalerangeZero = (max(solZero["w"])/np.nanmean(solZero["w"]))
        if scalerangeZero < max_w_wt and int(self._params.get("require_nonzero_lambda",0))==0:
            return solZero, {"lambda": 0, "alpha": alpha, "beta": beta}
        else:
            return solHi, {"lambda": 1/hi, "alpha": alpha, "beta": beta}
    # even the strongest regularization violates the spread limit: return it as-is
    if scalerangeLo > max_w_wt: return solLo, {"lambda": 1/lo, "alpha": alpha, "beta": beta}
    niter=0
    # geometric-mean bisection; stop after 60 iterations or when the bracket is tight
    while (niter < 60 and (hi/lo -1)>0.001):
        mid = np.exp((np.log(lo)+np.log(hi))/2)
        solMid = self._doQPSolve(P1, q1, g1, h1, l_q, nPointPairs, 1/mid, alpha, beta)
        scalerangeMid = (max(solMid["w"])/np.nanmean(solMid["w"]))
        if (scalerangeLo <= max_w_wt <= scalerangeMid):
            # mid's spread exceeds the limit: tighten the weak-regularization end
            hi = mid
            scalerangeHi = scalerangeMid
            solHi = solMid
        else:
            # mid is still within the limit: relax the strong-regularization end
            lo = mid
            scalerangeLo = scalerangeMid
            solLo = solMid
        niter += 1
        schema_debug ("Flag 42.113 ", niter, lo, mid, hi, max(solLo["w"]), min(solLo["w"]), max(solHi["w"]), min(solHi["w"]), scalerangeMid)
    # return the feasible (within-limit) side of the final bracket
    return solLo, {"lambda": 1/lo, "alpha": alpha, "beta": beta}
def _fit_helper(self, dx, d0, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list):
    """Core fitting routine shared by the scale/affine modes.

    Optionally subsamples the rows, computes centered pairwise-distance terms,
    assembles the QP matrices, and runs the hyperparameter search.
    Returns (weight_vector, solution_summary_pairs); raises if the QP search
    finds no feasible solution.
    """
    nPointPairs = self._params.get("dist_npairs", 2000000)
    sample_rate = self._params.get("dist_df_frac", 1.0)
    w_max_to_avg = self._w_max_to_avg
    min_desired_corr = self._min_desired_corr
    nRows = dx.shape[0]
    schema_info ("Flag 102.30 ", dx.shape, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
    # bundle each secondary modality as a (values, type, weight) triple
    G = []
    for i, (vals, vtype) in enumerate(zip(secondary_data_val_list, secondary_data_type_list)):
        wt = None if secondary_data_wt_list is None else secondary_data_wt_list[i]
        G.append((vals.copy(), vtype, wt))
    # optionally subsample rows (consistently across all modalities) before
    # sampling point pairs
    if sample_rate < 0.9999999:
        keep = np.random.choice(nRows, size=int(sample_rate * nRows), replace=False)
        dx = dx[keep, :]
        if d0 is not None:
            d0 = d0[keep, :]
        G = [(vals[keep], vtype, wt) for (vals, vtype, wt) in G]
    schema_info ("Flag 102.35 ", dx.shape, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
    uv_dist_centered, z_d0, l_z_dg = self._getDistances(dx, d0, G, nPointPairs)
    schema_info ("Flag 102.36 ", uv_dist_centered.shape, z_d0.shape, len(l_z_dg))
    P1, q1, g1, h1, l_q, nPointPairs1 = self._prepareQPterms(uv_dist_centered, z_d0, l_z_dg)
    schema_info ("Flag 102.37 ")
    soln, free_params = self._doQPiterations(P1, q1, g1, h1, l_q, nPointPairs1, w_max_to_avg, min_desired_corr)
    if soln is None:
        raise Exception("Couldn't find valid solution to QP")
    schema_info ("Flag 102.40 Final solution: ", self._summarizeSoln(soln, free_params))
    return soln["w"].ravel(), self._summarizeSoln(soln, free_params)
def _fit_scale(self, d, d0, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list):
    """Fit in 'scale' mode: learn one weight per original input feature.

    Optionally standardizes the primary data first, then runs the QP and
    stores the normalized weights in self._wts (average weight == 1) and the
    solution summary in self._soln_info.
    """
    data = d.copy()
    if self._params.get("scale_mode_uses_standard_scaler", 0) > 0:
        self._std_scaler = sklearn.preprocessing.StandardScaler()
    else:
        # identity scaler (no centering, no scaling): a slightly wasteful no-op
        # that keeps the downstream transform() path uniform across both branches
        self._std_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=False)
    dx = self._std_scaler.fit_transform(data)
    print('Running quadratic program...', end='', flush=True)
    wts, summary = self._fit_helper(dx, d0, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
    self._wts = np.maximum(wts, 0)  # shouldn't be <0 anyways, except for numerical issues
    self._wts = len(self._wts) * self._wts / np.sum(self._wts)  # normalize so that avg wt = 1
    self._soln_info = dict(summary)
    print(' done.\n', end='', flush=True)
def _fit_affine(self, d, d0, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list):
    """Fit in 'affine' mode: change basis (PCA/NMF), then learn per-component weights.

    The decomposition model is fit once and cached in self._orig_decomp_mdl;
    subsequent calls deep-copy the cached model instead of refitting, so the
    basis stays fixed across fits. Results are stored in self._wts (normalized
    to average 1) and self._soln_info.
    """
    do_whiten = self._params.get("do_whiten",1)>0
    ncomp = self._params.get("num_top_components",None) #default is all
    model_type = self._params.get("decomposition_model","pca").lower()
    dx1 = d.copy()
    if self._decomp_mdl is None:
        # first fit: build and train the change-of-basis model
        print('Running change-of-basis transform ({0}, {1} components)...'.format(model_type, ncomp), end='', flush=True)
        if model_type=="pca":
            if not scipy.sparse.issparse(dx1):
                self._decomp_mdl = sklearn.decomposition.PCA(n_components=ncomp, whiten=do_whiten)
            else:
                # PCA can't take sparse input; TruncatedSVD has no whiten option
                self._decomp_mdl = sklearn.decomposition.TruncatedSVD(n_components=ncomp)
                schema_warning("Using TruncatedSVD instead of PCA because input is a sparse matrix. do_whiten arguments will be ignored")
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self._decomp_mdl.fit(dx1)
        elif model_type=="nmf":
            # NOTE(review): alpha/l1_ratio are deprecated/removed in newer
            # scikit-learn NMF versions — confirm against the pinned sklearn
            self._decomp_mdl = sklearn.decomposition.NMF(n_components=ncomp, random_state=0, alpha=0, l1_ratio=0)
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self._decomp_mdl.fit(dx1)
        # keep a pristine copy so later fits restart from the same basis
        # (the whitening below mutates self._decomp_mdl in place)
        self._orig_decomp_mdl = copy.deepcopy(self._decomp_mdl)
        print(' done.')
    else:
        print('Reusing previous change-of-basis transformation')
        self._decomp_mdl = copy.deepcopy(self._orig_decomp_mdl)
    if model_type=="pca":
        dx = self._decomp_mdl.transform(dx1)
    elif model_type=="nmf":
        W = self._decomp_mdl.transform(dx1)
        if do_whiten:
            # rescale columns of W to unit L2 norm and fold the scale back into
            # components_, so the product W @ components_ is unchanged
            # (H is only referenced by the commented-out alternative below)
            H = self._decomp_mdl.components_
            wsd = np.ravel(np.sqrt(np.sum(W**2, axis=0))) #W.std(axis=0)
            W = W/wsd[None,:]
            self._decomp_mdl.components_ *= wsd[:,None]
            # hsd = np.sqrt(np.sum(H**2, axis=1))
            # H = H/hsd[:,None]
            # self._decomp_mdl.components_ = H
            # W = W*hsd[None,:]
        dx = W
    print('Running quadratic program...', end='', flush=True)
    s, sl = self._fit_helper(dx, d0, secondary_data_val_list, secondary_data_type_list, secondary_data_wt_list)
    self._wts = np.maximum(s,0) # shouldn't be <0 anyways, except for numerical issues
    self._wts = len(self._wts)*self._wts/np.sum(self._wts) #let's normalize so that avg wt = 1
    self._soln_info = dict(sl)
    print(' done.\n', end='', flush=True)