Source code for nanite.rate.rater

import pathlib
from pkg_resources import resource_filename

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import FunctionTransformer


from .features import IndentationFeatures
from .regressors import reg_dict, reg_names, reg_trees


[docs]class IndentationRater(IndentationFeatures): def __init__(self, regressor=None, scale=None, lda=None, training_set=None, names=None, weight=True, sample_weight=None, *args, **kwargs): """Rate quality Parameters ---------- regressor: sciki-learn RegressorMixin The regressor used for rating scale: bool If True, apply a Standard Scaler. If a regressor based on decision trees is used, the Standard Scaler is not used by default, otherwise it is. lda: bool If True, apply a Linear Discriminant Analysis (LDA). If a regressor based on a decision tree is used, LDA is not used by default, otherwise it is. training_set: tuple of (X, y) The training set (samples, response) names: list of str Feature names to use weight: bool Weight the input samples by the number of occurrences or with `sample_weight`. For tree-based classifiers, set this to True to avoid bias. sample_weight: list-like The sample weights. If set to `None` sample weights are computed from the training set. *args: list Positional arguments for :class:`IndentationFeatures` **kwargs: Keyword arguments for :class:`IndentationFeatures` See Also -------- sklearn.preprocessing.StandardScaler: Standard scaler sklearn.discriminant_analysis.LinearDiscriminantAnalysis: Linear discriminant analysis nanite.rate.regressors.reg_trees: List of regressors that are identified as tree-based """ if regressor is not None: _name = regressor.__class__.__name__ if lda is None: lda = False if _name in reg_trees else True if scale is None: scale = False if _name in reg_trees else True # training set if training_set is None: # default training_set = self.load_training_set(names=names) # sample weights if sample_weight is None: sample_weight = self.compute_sample_weight(*training_set) steps = [] # scaling (does not affect decision trees / random forests) if scale: steps.append(StandardScaler()) # linear discriminant analysis if lda: steps.append(LinearDiscriminantAnalysis()) if regressor is not None: steps.append(regressor) if len(steps) == 0: dummy = FunctionTransformer(lambda x: x) steps.append(dummy) #: sklearn pipeline with transforms (and regressor if given) self.pipeline = make_pipeline(*steps) fit_params = {} if regressor is not None and weight: # set weighting for regressor key = "{}__sample_weight".format(self.pipeline.steps[-1][0]) fit_params[key] = sample_weight if regressor is not None: self.pipeline.fit(*training_set, **fit_params) names = self.get_feature_names(names=names, which_type="all") #: feature names used by the regressor pipeline self.names = sorted(names) super(IndentationRater, self).__init__(*args, **kwargs) def _pre_rate(self, bsample): """exclude based on boolean training set""" if np.sum(bsample == 0): # bad curve return False else: # good curve return True def _rate(self, sample): gd = self.pipeline.predict(np.atleast_2d(sample)) return gd[0]
[docs] @staticmethod def compute_sample_weight(X, y): """Weight samples according to occurrence in y""" if not np.all(np.array(y, dtype=int) == y): msg = "Only integer ratings allowed." raise NotImplementedError(msg) weight = np.zeros(y.shape[0], dtype=float) for ii in range(11): idxii = y == ii occur = np.sum(idxii) if occur: # Sometimes the training set is not large enough. # If no occurences were found, the weights remain # zero. weight[idxii] = 1 / occur # normalize weight /= np.sum(weight) return weight
[docs] @staticmethod def get_training_set_path(label="zef18"): """Return the path to a training set shipped with nanite Training sets are stored in the `nanite.rate` module path with ``ts_`` prepended to `label`. """ data_loc = "nanite.rate" resp_path = resource_filename(data_loc, "ts_{}".format(label)) return resp_path
[docs] @classmethod def load_training_set(cls, path=None, names=None, which_type=["continuous"], remove_nan=True, ret_names=False): """Load a training set from a directory By default, only the "continuous" features are imported. The "binary" features are not needed for training; they are used to sort out new force-distance data. """ fnames = cls.get_feature_names(which_type=which_type, names=names) sample_paths = [] if path is None: path = cls.get_training_set_path() path = pathlib.Path(path).resolve() resp_path = str(path / "train_response.txt") for fn in fnames: resf = str(path / "train_{}.txt".format(fn)) sample_paths.append(resf) samples = [np.loadtxt(sp, dtype=float, ndmin=2) for sp in sample_paths] samples = np.concatenate(samples, axis=1) response = np.loadtxt(resp_path, dtype=float) if remove_nan: # Remove nan-values from training set valid = ~np.isnan(np.sum(samples, axis=1)) samples = samples[valid, :] # remove corresponding responses response = response[valid] res = [samples, response] if ret_names: res.append(fnames) return res
[docs] def rate(self, samples=None, datasets=None): """Perform rating step Parameters ---------- samples: 1d or 2d ndarray (cast to 2d ndarray) or None Measured samples, if set to None, `dataset` must be given. dataset: list of nanite.Indentation Full, fitted measurement Returns ------- ratings: list Resulting ratings """ if samples is None and datasets is None: # use dataset from IndentationFeature datasets = [self.dataset] elif datasets is None: # distinguish between binary and other samples fsamples = [] bsamples = [] fnames = self.get_feature_names( names=self.names, which_type=["continuous"]) for samp in samples: fsamp = [] # continuous samples bsamp = [] # binary samples for ii, name in enumerate(self.names): if name in fnames: fsamp.append(samp[ii]) else: assert name.startswith("feat_bin_") bsamp.append(samp[ii]) fsamples.append(fsamp) bsamples.append(bsamp) else: if not isinstance(datasets, (list, tuple)): datasets = [datasets] fsamples = [] bsamples = [] # continuous features for idnt in datasets: samp = self.compute_features( idnt=idnt, names=self.names, which_type=["continuous"]) fsamples.append(samp) # binary features for idnt in datasets: bsamp = self.compute_features(idnt=idnt, names=self.names, which_type="binary") bsamples.append(bsamp) fsamples = np.atleast_2d(fsamples) bsamples = np.atleast_2d(bsamples) ratings = [] for bsamp, fsamp in zip(bsamples, fsamples): if not self._pre_rate(bsamp): # certainly a bad curve gd = 0 elif np.isnan(np.sum(fsamp)): # ignore nan-valued samples gd = -1 else: gd = self._rate(fsamp) ratings.append(gd) return np.array(ratings).flatten()
[docs]def get_available_training_sets(): """List of internal training sets""" data_loc = "nanite" resp_path = resource_filename(data_loc, "rate") avail = [] for pp in pathlib.Path(resp_path).glob("ts_*"): avail.append(pp.name[3:]) return sorted(avail)
[docs]def get_rater(regressor, training_set="zef18", names=None, lda=None, **reg_kwargs): """Convenience method to get a rater Parameters ---------- regressor: str or RegressorMixin If a string, must be in `reg_names`. training_set: str or pathlib.Path or tuple (X, y) A string label representing a training set shipped with nanite, the path to a training set, or a tuple representing the training set (samples, response) for use with sklearn. Returns ------- irater: nanite.IndentationRater The rating instance. """ avr = get_available_training_sets() if isinstance(training_set, tuple): pass else: if training_set in avr: ts_path = IndentationRater.get_training_set_path( label=training_set) else: ts_path = training_set training_set = IndentationRater.load_training_set( path=ts_path, names=names) if len(training_set) != 2: raise ValueError("Expected training_set of the form (X, y)!") if isinstance(regressor, str): if regressor not in reg_names: msg = "Unknown regressor name: '{}'!".format(regressor) \ + " Please pass your own sklearn RegressorMixin." raise ValueError(msg) reg_cl, default_kw = reg_dict[regressor] kw = default_kw.copy() kw.update(reg_kwargs) regr = reg_cl(**kw) else: regr = regressor rater = IndentationRater(regressor=regr, training_set=training_set, names=names, lda=lda) return rater