import pathlib
from pkg_resources import resource_filename
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import FunctionTransformer
from .features import IndentationFeatures
from .regressors import reg_dict, reg_names, reg_trees
class IndentationRater(IndentationFeatures):
    """Rate the quality of force-distance data with a regressor pipeline"""

    def __init__(self, regressor=None, scale=None, lda=None,
                 training_set=None, names=None,
                 weight=True, sample_weight=None,
                 *args, **kwargs):
        """Rate quality

        Parameters
        ----------
        regressor: scikit-learn RegressorMixin
            The regressor used for rating
        scale: bool
            If True, apply a Standard Scaler. If a regressor based on
            decision trees is used, the Standard Scaler is not used
            by default, otherwise it is.
        lda: bool
            If True, apply a Linear Discriminant Analysis (LDA). If a
            regressor based on a decision tree is used, LDA is not
            used by default, otherwise it is.
        training_set: tuple of (X, y)
            The training set (samples, response)
        names: list of str
            Feature names to use
        weight: bool
            Weight the input samples by the number of occurrences
            or with `sample_weight`. For tree-based classifiers, set this
            to True to avoid bias.
        sample_weight: list-like
            The sample weights. If set to `None` sample weights
            are computed from the training set.
        *args: list
            Positional arguments for :class:`IndentationFeatures`
        **kwargs:
            Keyword arguments for :class:`IndentationFeatures`

        See Also
        --------
        sklearn.preprocessing.StandardScaler:
            Standard scaler
        sklearn.discriminant_analysis.LinearDiscriminantAnalysis:
            Linear discriminant analysis
        nanite.rate.regressors.reg_trees:
            List of regressors that are identified as tree-based
        """
        if regressor is not None:
            _name = regressor.__class__.__name__
            # Tree-based regressors are invariant to feature scaling,
            # so scaling/LDA are disabled for them by default.
            if lda is None:
                lda = _name not in reg_trees
            if scale is None:
                scale = _name not in reg_trees
        # training set
        if training_set is None:
            # default training set shipped with nanite
            training_set = self.load_training_set(names=names)
        # sample weights
        if sample_weight is None:
            sample_weight = self.compute_sample_weight(*training_set)

        steps = []
        # scaling (does not affect decision trees / random forests)
        if scale:
            steps.append(StandardScaler())
        # linear discriminant analysis
        if lda:
            steps.append(LinearDiscriminantAnalysis())
        if regressor is not None:
            steps.append(regressor)
        if not steps:
            # make_pipeline requires at least one step; fall back to
            # an identity transform
            steps.append(FunctionTransformer(lambda x: x))
        #: sklearn pipeline with transforms (and regressor if given)
        self.pipeline = make_pipeline(*steps)

        fit_params = {}
        if regressor is not None and weight:
            # address the sample_weight fit parameter of the final
            # (regressor) pipeline step
            key = "{}__sample_weight".format(self.pipeline.steps[-1][0])
            fit_params[key] = sample_weight
        if regressor is not None:
            self.pipeline.fit(*training_set, **fit_params)

        names = self.get_feature_names(names=names, which_type="all")
        #: feature names used by the regressor pipeline
        self.names = sorted(names)
        super(IndentationRater, self).__init__(*args, **kwargs)

    def _pre_rate(self, bsample):
        """Exclude a curve based on its binary features

        Returns False (bad curve) if any binary feature is zero,
        True (good curve) otherwise.
        """
        return not np.sum(bsample == 0)

    def _rate(self, sample):
        """Predict the rating of a single continuous feature sample"""
        gd = self.pipeline.predict(np.atleast_2d(sample))
        return gd[0]

    @staticmethod
    def compute_sample_weight(X, y):
        """Weight samples inversely to the occurrence of their rating in y

        Parameters
        ----------
        X: 2d ndarray
            Training samples (unused; kept for (X, y) unpacking symmetry)
        y: 1d ndarray
            Integer-valued responses in the range 0..10

        Returns
        -------
        weight: 1d ndarray of float
            Normalized weights (sum to 1); samples with a rare rating
            get a larger weight.

        Raises
        ------
        NotImplementedError
            If `y` contains non-integer ratings
        """
        if not np.all(np.array(y, dtype=int) == y):
            msg = "Only integer ratings allowed."
            raise NotImplementedError(msg)
        weight = np.zeros(y.shape[0], dtype=float)
        # ratings are integers from 0 to 10
        for ii in range(11):
            idxii = y == ii
            occur = np.sum(idxii)
            if occur:
                # Sometimes the training set is not large enough.
                # If no occurrences were found, the weights remain
                # zero.
                weight[idxii] = 1 / occur
        # normalize
        weight /= np.sum(weight)
        return weight

    @staticmethod
    def get_training_set_path(label="zef18"):
        """Return the path to a training set shipped with nanite

        Training sets are stored in the `nanite.rate`
        module path with ``ts_`` prepended to `label`.
        """
        data_loc = "nanite.rate"
        resp_path = resource_filename(data_loc, "ts_{}".format(label))
        return resp_path

    @classmethod
    def load_training_set(cls, path=None, names=None,
                          which_type=None,
                          remove_nan=True, ret_names=False):
        """Load a training set from a directory

        By default, only the "continuous" features are imported. The
        "binary" features are not needed for training; they are used
        to sort out new force-distance data.

        Parameters
        ----------
        path: str or pathlib.Path or None
            Directory containing ``train_response.txt`` and one
            ``train_<feature>.txt`` file per feature; if None, the
            default training set shipped with nanite is used.
        names: list of str or None
            Feature names to load (None means all features of the
            requested type).
        which_type: list of str or None
            Feature types to load; defaults to ``["continuous"]``.
        remove_nan: bool
            Remove samples containing nan values (and their responses).
        ret_names: bool
            Additionally return the loaded feature names.

        Returns
        -------
        res: list
            ``[samples, response]`` or ``[samples, response, names]``
            if `ret_names` is True.
        """
        # avoid a mutable default argument
        if which_type is None:
            which_type = ["continuous"]
        fnames = cls.get_feature_names(which_type=which_type, names=names)
        if path is None:
            path = cls.get_training_set_path()
        path = pathlib.Path(path).resolve()
        resp_path = str(path / "train_response.txt")
        sample_paths = [str(path / "train_{}.txt".format(fn))
                        for fn in fnames]
        samples = [np.loadtxt(sp, dtype=float, ndmin=2) for sp in sample_paths]
        samples = np.concatenate(samples, axis=1)
        response = np.loadtxt(resp_path, dtype=float)
        if remove_nan:
            # Remove nan-values from training set
            valid = ~np.isnan(np.sum(samples, axis=1))
            samples = samples[valid, :]
            # remove corresponding responses
            response = response[valid]
        res = [samples, response]
        if ret_names:
            res.append(fnames)
        return res

    def rate(self, samples=None, datasets=None):
        """Perform rating step

        Parameters
        ----------
        samples: 1d or 2d ndarray (cast to 2d ndarray) or None
            Measured samples, if set to None, `datasets` must be given.
        datasets: list of nanite.Indentation
            Full, fitted measurements

        Returns
        -------
        ratings: 1d ndarray
            Resulting ratings; -1 for nan-valued samples, 0 for
            curves excluded by the binary features.
        """
        if samples is None and datasets is None:
            # use dataset from IndentationFeature
            datasets = [self.dataset]
        if datasets is None:
            # distinguish between binary and other samples
            fsamples = []
            bsamples = []
            fnames = self.get_feature_names(
                names=self.names,
                which_type=["continuous"])
            for samp in samples:
                fsamp = []  # continuous samples
                bsamp = []  # binary samples
                for ii, name in enumerate(self.names):
                    if name in fnames:
                        fsamp.append(samp[ii])
                    else:
                        assert name.startswith("feat_bin_")
                        bsamp.append(samp[ii])
                fsamples.append(fsamp)
                bsamples.append(bsamp)
        else:
            if not isinstance(datasets, (list, tuple)):
                datasets = [datasets]
            fsamples = []
            bsamples = []
            for idnt in datasets:
                # continuous features
                fsamples.append(self.compute_features(
                    idnt=idnt,
                    names=self.names,
                    which_type=["continuous"]))
                # binary features
                bsamples.append(self.compute_features(
                    idnt=idnt,
                    names=self.names,
                    which_type="binary"))
        fsamples = np.atleast_2d(fsamples)
        bsamples = np.atleast_2d(bsamples)

        ratings = []
        for bsamp, fsamp in zip(bsamples, fsamples):
            if not self._pre_rate(bsamp):
                # certainly a bad curve
                gd = 0
            elif np.isnan(np.sum(fsamp)):
                # ignore nan-valued samples
                gd = -1
            else:
                gd = self._rate(fsamp)
            ratings.append(gd)
        return np.array(ratings).flatten()
def get_available_training_sets():
    """Return a sorted list of the training-set labels shipped with nanite

    Training sets are directories in the ``nanite/rate`` module path
    whose names start with ``ts_``; the returned labels have that
    prefix stripped.
    """
    rate_dir = resource_filename("nanite", "rate")
    # directory names are "ts_<label>"; strip the "ts_" prefix
    return sorted(pp.name[3:]
                  for pp in pathlib.Path(rate_dir).glob("ts_*"))
def get_rater(regressor, training_set="zef18", names=None,
              lda=None, **reg_kwargs):
    """Convenience method to get a rater

    Parameters
    ----------
    regressor: str or RegressorMixin
        If a string, must be in `reg_names`.
    training_set: str or pathlib.Path or tuple (X, y)
        A string label representing a training set shipped with
        nanite, the path to a training set, or a tuple
        representing the training set (samples, response)
        for use with sklearn.
    names: list of str
        Feature names to use for training and rating
    lda: bool
        Whether to perform linear discriminant analysis; if None,
        a regressor-dependent default is used
        (see :class:`IndentationRater`).
    **reg_kwargs:
        Keyword arguments for the regressor class; only used when
        `regressor` is given as a string.

    Returns
    -------
    irater: nanite.IndentationRater
        The rating instance.

    Raises
    ------
    ValueError
        If the regressor name is unknown or the training set is not
        of the form (X, y).
    """
    if not isinstance(training_set, tuple):
        if training_set in get_available_training_sets():
            # label of a training set shipped with nanite
            ts_path = IndentationRater.get_training_set_path(
                label=training_set)
        else:
            # user-supplied path to a training set
            ts_path = training_set
        training_set = IndentationRater.load_training_set(
            path=ts_path,
            names=names)
    if len(training_set) != 2:
        raise ValueError("Expected training_set of the form (X, y)!")
    if isinstance(regressor, str):
        if regressor not in reg_names:
            msg = "Unknown regressor name: '{}'!".format(regressor) \
                  + " Please pass your own sklearn RegressorMixin."
            raise ValueError(msg)
        # instantiate the regressor with its default keyword arguments
        # updated by the user-supplied ones
        reg_cl, default_kw = reg_dict[regressor]
        kw = default_kw.copy()
        kw.update(reg_kwargs)
        regr = reg_cl(**kw)
    else:
        regr = regressor
    return IndentationRater(regressor=regr,
                            training_set=training_set,
                            names=names,
                            lda=lda)