Source code for nanite.fit

import copy
import hashlib
import warnings

import lmfit
import numpy as np
import scipy.signal as spsig

from . import model


FP_DEFAULT = dict(model_key="hertz_para",
                  optimal_fit_edelta=False,
                  optimal_fit_num_samples=100,
                  params_initial=None,
                  preprocessing=[],
                  range_type="absolute",
                  range_x=[0, 0],
                  segment="approach",
                  weight_cp=1e-6,
                  x_axis="tip position",
                  y_axis="force",
                  )

FP_RESULTS = ["chi_sqr",
              "hash",
              "optimal_fit_delta_array",
              "optimal_fit_delta",
              "optimal_fit_E_array",
              "params_fitted",
              "success",
              "xmax",
              "xmin",
              ]


[docs]class FitKeyError(BaseException):
    pass


[docs]class FitDataError(BaseException):
    pass


[docs]class FitWarning(UserWarning):
    pass


[docs]class FitProperties(dict):
    """Fit property manager class

    Provide convenient access to fit properties as a dictionary
    and dynamically manage resets due to new initial parameters.

    Dynamic properties include:

    - set "params_initial" to `None` if the "model_key" changes
    - remove all keys except those in `FP_DEFAULT` if a key that is
      in `FP_DEFAULT` changes (All other keys are considered to be
      obsolete fitting results).

    Additional attributes:

    - "segment_bool": bool
        `False` for "approach" and `True` for "retract"
    """

    def __getitem__(self, key):
        if key == "segment_bool":
            if self["segment"] == "approach":
                return False
            if self["segment"] == "retract":
                return True
            else:
                msg = "Unknown segment: {}".format(self["segment"])
                raise FitKeyError(msg)
        else:
            return super(FitProperties, self).__getitem__(key)

    def __setitem__(self, key, value):
        if key in FP_DEFAULT:
            if (key in self
                and key == "params_initial"
                    and self["params_initial"] is not None
                    and value is not None):
                # check for changed initial parameters
                for pp in self["params_initial"]:
                    s1 = self["params_initial"][pp].__getstate__()
                    s2 = value[pp].__getstate__()
                    if s1 != s2:
                        self.reset()
                        break
            elif key in self and self[key] == value:
                # nothing to do, parameters match
                pass
            else:
                if key == "model_key":
                    # Other model has other parameters.
                    self["params_initial"] = None
                elif key == "range_x":
                    if ("optimal_fit_edelta" in self and
                        self["optimal_fit_edelta"] and
                        "range_x" in self and
                            self["range_x"][1] == value[1]):
                        # Ignore changes in range[0]
                        return
                # Trigger `self.reset`
                self.reset()
        elif key not in FP_RESULTS:
            msg = "Key '{}' not in FP_DEFAULT".format(key)
            raise FitKeyError(msg)
        super(FitProperties, self).__setitem__(key, value)

[docs]    def reset(self):
        for key in list(self.keys()):
            if key not in FP_DEFAULT:
                self.pop(key)

[docs]    def restore(self, props):
        """update the dictionary without removing any keys"""
        for key in props:
            super(FitProperties, self).__setitem__(key, props[key])


[docs]class IndentationFitter(object):
    def __init__(self, idnt, **kwargs):
        """Fit force-distance curves

        Parameters
        ----------
        idnt: nanite.indent.Indentation
            The dataset to fit
        model_key: str
            A key referring to a model in
            `nanite.model.models_available`
        params_initial: instance of lmfit.Parameters
            Parameters for fitting. If not given,
            default parameters are used.
        range_x: tuple of 2
            The range for fitting, see `range_type` below.
        range_type: str
            One of:

            - absolute:
                Set the absolute fitting range in values
                given by the `x_axis`.
            - relative cp:
                In some cases it is desired to be able to
                fit a model only up until a certain indentation
                depth (tip position) measured from the contact
                point. Since the contact point is a fit parameter
                as well, this requires a two-pass fitting.
        preprocessing: list of str
            Preprocessing
        segment: str
            One of "approach" or "retract".
        weight_cp: float
            Weight the contact point region which shows artifacts
            that are difficult to model with e.g. Hertz.
        optimal_fit_edelta: bool
            Search for the optimal fit by varying the maximal
            indentation depth and determining a plateau in the
            resulting Young's modulus (fitting parameter "E").
        optimal_fit_num_samples: int
            Number of samples to use for searching the optimal fit
        """
        # Get initial fit parameters
        # IMPORTANT:
        # If there are new additions in the default values,
        # make sure to take these into account in `FP_DEFAULT`.
        self.fp = FitProperties(**FP_DEFAULT)

        # Get parameters from dataset
        # (sorted, such that `model_key` is set before `params_initial`)
        for key in sorted(idnt.fit_properties.keys()):
            if key in FP_DEFAULT:
                self.fp[key] = idnt.fit_properties[key]
        # Get parameters from kwargs
        # (sorted, such that `model_key` is set before `params_initial`)
        for key in sorted(self.fp.keys()):
            if key in kwargs:
                if key not in FP_DEFAULT:
                    msg = "Key '{}' not in FP_DEFAULT".format(key)
                    raise FitKeyError(msg)
                self.fp[key] = kwargs[key]
        # Set initial fitting parameters
        if self.fp["params_initial"] is None:
            self.fp["params_initial"] = self.get_initial_parameters(
                idnt=idnt,
                model_key=self.fp["model_key"]
            )

        # Set arrays
        self.segment = (idnt["segment"] == self.fp["segment_bool"])
        self.segment.setflags(write=False)

        self.x_axis = idnt[self.fp["x_axis"]]
        self.x_axis.setflags(write=False)

        self.y_axis = idnt[self.fp["y_axis"]]
        self.y_axis.setflags(write=False)

        self.fit_range = np.zeros_like(self.segment)
        self.fit_curve = np.zeros_like(self.y_axis)
        self.fit_residuals = np.zeros_like(self.y_axis)

        # Fitting parameters might be changed due to iterative fitting.
        # Store them in separate variables to prevent resetting the
        # `FitPropreties` instance `self.fp`.
        self.optimal_fit_edelta = self.fp["optimal_fit_edelta"]
        self.range_type = self.fp["range_type"]
        self.range_x = list(self.fp["range_x"])

        self.hash = self._hash()
        self.fp["hash"] = self.hash

        # Perform sanity checks
        if self.fp["range_type"] not in ["absolute", "relative cp"]:
            msg = "`range_type` must be  'absolute' or 'relative cp'!"
            raise FitKeyError(msg)
        if len(self.fp["range_x"]) != 2:
            raise FitKeyError("`range_x` must have length 2!")
        if (np.isnan(self.fp["range_x"][0]) or
                np.isnan(self.fp["range_x"][1])):
            raise FitKeyError("`range_x` must not contain NaN!")
        if self.fp["segment"] not in ["approach", "retract"]:
            msg = "`segment` must be  'approach' or 'retract'!"
            raise FitKeyError(msg)
        if self.fp["model_key"] not in model.models_available:
            msg = "unknown model '{}'".format(self.fp["model_key"])
            raise FitKeyError(msg)

        if self.fp["optimal_fit_edelta"]:
            # Make sure we have emodulus
            if "E" not in self.fp["params_initial"]:
                msg = "Search for optimal fit requires the parameter 'E'!"
                raise FitKeyError(msg)
            # This only works with absolute range
            if self.fp["range_type"] != "absolute":
                msg = "Only absolute range relative to contact point allowed!"
                raise FitKeyError(msg)

        md_key = self.fp["model_key"]
        md = model.models_available[md_key]
        params = md.get_parameter_defaults()
        for p in params:
            msg = "Unknown fitting parameter '{}' for model '{}'!"
            if p not in self.fp["params_initial"]:
                raise FitKeyError(msg.format(p, md_key))

        if self.fp["range_x"][0] > self.fp["range_x"][1]:
            msg = "Fitting range is inverted: {}".format(self.fp["range_x"])
            warnings.warn(msg, FitWarning)

[docs]    def compute_emodulus_vs_mindelta(self, callback=None):
        """Compute elastic modulus vs. minimal indentation curve"""
        segid = self.segment
        xseg = self.x_axis[segid]
        yseg = self.y_axis[segid]

        xmax = np.max(self.fp["range_x"])
        if np.isinf(xmax):
            xmax = np.max(xseg)
        # Disable and remember `optimal_fit_edelta`
        optimal_fit_edelta = self.optimal_fit_edelta
        self.optimal_fit_edelta = False

        # We are agnostic concerning the direction of indentation.
        # `xseg` should start at the baseline.
        seems_approach = np.average(yseg[:10]) < np.average(yseg[-10:])
        if seems_approach and self.fp["segment"] == "approach":
            if xseg[0] < xseg[-1]:
                msg = "Unexpected trend in approach x data!"
                raise FitDataError(msg)
        elif seems_approach:
            msg = "Data appears to be 'approach', but is 'retract'!"
            raise FitDataError(msg)
        elif not seems_approach and self.fp["segment"] == "retract":
            if xseg[0] > xseg[-1]:
                msg = "Unexpected trend in retract x data!"
                raise FitDataError(msg)
            msg = "Unexpected trend in retract curve!"
            raise FitDataError(msg)
        elif not seems_approach:
            msg = "Data appears to be 'retract', but is 'approach'!"
            raise FitDataError(msg)

        # Fit the range of parameters
        xmin = xseg.min()
        if xmin >= 0:
            msg = "No negative values (indentation) found! " \
                  + "Did you correct for tip offset?"
            raise FitKeyError(msg)
        num_samp = self.fp["optimal_fit_num_samples"]
        indentations = np.linspace(xmin, xmin*.05, num_samp)
        emoduli = np.zeros_like(indentations)
        for ii, x0 in enumerate(indentations):
            # Perform a fit and record the elastic modulus
            self.range_x = [x0, xmax]
            self.fit()
            emoduli[ii] = self.fp["params_fitted"]["E"].value
            if callback and ii % 5 == 0:
                callback(emoduli, indentations)

        self.optimal_fit_edelta = optimal_fit_edelta

        return emoduli, indentations

[docs]    @staticmethod
    def compute_opt_mindelta(emoduli, indentations):
        """Determine the plateau of an emodulus-indentation curve

        The following procedure is performed:

        1. Smooth the emodulus data with a Butterworth filter
        2. Label sequences that have similar values by binning
           into ten regions between the min and max.
        3. Ignore sequences with emodulus that is smaller than
           the binning size.
        4. Determine the longest sequence.
        """
        # Perform smoothing of the curve
        # First, perform filtering with butterworth filter
        nb = 1      # Filter order
        wb = 0.05    # Cutoff frequency
        b, a = spsig.butter(nb, wb, output='ba')
        smooth_e = spsig.filtfilt(b, a, emoduli)
        # Second, determine the longest sequence of values
        # that have the same smoothed value.
        ni = 10
        ivals, istep = np.linspace(smooth_e.min(), smooth_e.max(), ni,
                                   endpoint=False, retstep=True)
        ivals += istep/2
        labelarray = np.zeros_like(smooth_e, dtype=int)
        valarray = np.zeros_like(smooth_e, dtype=int)
        # label each sequence with an individual `idx`
        for ii in range(smooth_e.shape[0]):
            valid = np.argmin(np.abs(ivals - smooth_e[ii]))
            if ii == 0:
                idx = 0
            elif valid == valarray[ii-1]:
                pass
            else:
                idx += 1
            labelarray[ii] = idx
            valarray[ii] = valid
        # Determine the longest sequence
        counts = list(np.bincount(labelarray))
        # Ignore values that are below cutoff

        for ii in range(len(counts)):
            labmax = np.argmax(counts)
            labid = np.where(labelarray == labmax)[0][0]
            valmax = ivals[valarray[labid]]
            if valmax > istep:
                break
            counts.pop(labmax)
        else:
            # Nothing found: select the middle value
            warnings.warn("Could not find correct plateau.", FitWarning)
            labmax = 5
        # Determine the interval in the original array
        indices = np.where(labelarray == labmax)[0]
        if len(indices) == 1:
            dopt = indentations[indices[0]]
        else:
            # compute optimal indentation as center
            dopt = np.average(indentations[indices[0]:indices[-1]])
        return dopt

    def _fit(self):
        """Fit a model to the data

        Notes
        -----
        This method is private because it requires the array
        `self.fit_range` before the actual fitting.
        """
        model_key = self.fp["model_key"]
        params_initial = self.fp["params_initial"]
        weight_cp = self.fp["weight_cp"]

        # boolean array indexing the segment
        segid = self.segment
        # x: the entire segment
        xseg = self.x_axis[segid]
        # y: the entire segment
        yseg = self.y_axis[segid]
        # x: the values being fitted
        x = self.x_axis[self.fit_range]
        # y: the values being fitted
        y = self.y_axis[self.fit_range]

        md = model.models_available[model_key]

        # short reference for better readability
        fit_cur = self.fit_curve
        fit_res = self.fit_residuals

        # reset values to nan
        fit_cur[:] = np.nan
        fit_res[:] = np.nan

        # Make sure that we can actually fit by comparing variable fitting
        # parameters and size of x.
        npvaried = np.sum([p[1].vary for p in list(params_initial.items())])
        if npvaried < x.shape[0] - 1:
            # perform fit
            fit = lmfit.minimize(
                md.residual, params_initial, args=(x, y, weight_cp))
            # fitted method
            fit_cur[segid] = md.model(fit.params, xseg)
            # residuals
            fit_res[segid] = md.residual(fit.params, xseg, yseg, weight_cp)
            # add fit results to fp dictionary
            self.fp.update({"params_fitted": fit.params,
                            "chi_sqr": fit.chisqr,
                            "xmin": x.min(),
                            "xmax": x.max(),
                            "success": True,
                            })
        else:
            self.fp["success"] = False

[docs]    def fit(self):
        """Fit the approach-retract data to a model function
        """
        range_type = self.range_type
        range_x = copy.copy(self.range_x)
        # Set to True in `self._fit`
        self.fp["success"] = False

        if self.optimal_fit_edelta:
            # emodulus-indentation curve:
            emoduli, indentations = self.compute_emodulus_vs_mindelta()
            # determine optimal x0
            dopt = self.compute_opt_mindelta(emoduli, indentations)
            # add indentations/emoduli to self.fp
            self.fp["optimal_fit_E_array"] = emoduli
            self.fp["optimal_fit_delta_array"] = indentations
            self.fp["optimal_fit_delta"] = dopt
            # update fitting parameters for final fit
            self.range_x = [dopt, np.max(self.fp["range_x"])]
            # perform final fit
            self.optimal_fit_edelta = False
            self.fit()
            self.optimal_fit_edelta = True

        elif self.range_type == "absolute":
            # This is easy. Simply set the boolean array of fitting values
            # Exclude data points from other segment
            if range_x[0] != range_x[1]:
                x_data = self.x_axis.copy()
                range_bool = self.segment.copy()
                rmin, rmax = np.min(range_x), np.max(range_x)
                range_bool[x_data < rmin] = False
                range_bool[x_data > rmax] = False
            else:
                range_bool = self.segment
            self.fit_range[:] = range_bool
            self._fit()

        elif range_type == "relative cp":
            # Let's try to solve this with four passes
            # First, get the approximate contact point
            self.range_type = "absolute"
            # Set full range to get estimate of cp
            self.range_x = [0, 0]
            self.fit()
            # Do the following three-times.
            for _i in range(3):
                # get the fitted contact point
                cp = self.fp["params_fitted"]["contact_point"].value
                self.range_x = list(np.array(range_x)+cp)
                # Second, fit with the new contact point as range parameters
                self.fit()

        # Reset range data to original values
        self.range_type = range_type
        self.range_x = range_x

[docs]    def get_initial_parameters(self, idnt=None,
                               model_key=FP_DEFAULT["model_key"]):
        """Get initial fit parameters for a specific model

        Parameters
        ----------

        """
        if (model_key == self.fp["model_key"] and
                self.fp["params_initial"] is not None):
            params = self.fp["params_initial"]
        else:
            params = guess_initial_parameters(idnt=idnt, model_key=model_key)
        return params

    def _hash(self):
        """Compute hash identifier for current fit

        The hash is computed without applying the fit, making
        it suitable for caching fits.
        """
        hashlist = []
        # preprocessing
        hashlist.append(self.fp["preprocessing"])
        # axes data
        hashlist.append(self.x_axis)
        hashlist.append(self.y_axis)
        # fit parameters
        for key in FP_DEFAULT:
            if (key == "range_x" and
                    self.fp["optimal_fit_edelta"]):
                # range only partly if "optimal_fit_edelta" is True
                hashlist.append(self.fp["range_x"][1])
            elif (key == "optimal_fit_num_samples" and
                  not self.fp["optimal_fit_edelta"]):
                # ignore number of samples if optimal fit is not used
                pass
            else:
                hashlist.append(self.fp[key])
        # join and hash
        myhash = hashlib.md5(obj2str(hashlist)).hexdigest()
        return myhash


[docs]def guess_initial_parameters(idnt=None,
                             model_key=FP_DEFAULT["model_key"],
                             common_ancillaries=True,
                             model_ancillaries=True):
    """Guess initial fitting parameters

    Parameters
    ----------
    idnt: nanite.indent.Indentation
        The dataset to use for guessing initial fitting parameters
        using ancillary parameters
    model_key: str
        The model key
    common_ancillaries: bool
        Guess global ancillary parameters (such as contact point)
    model_ancillaries: bool
        Use model-related ancillary parameters
    """
    md = model.models_available[model_key]
    params = md.get_parameter_defaults()
    if common_ancillaries and idnt is not None:
        # Guess initial contact point from actual tip position
        # (see `self.compute_tip_position`)
        # Depending on how the current dataset was pre-processed,
        # the column "tip position" might already be corrected by
        # an estimated contact point offset.
        # (see `self.compute_tip_offset`)
        if "tip position" in idnt:
            cpid = idnt.estimate_contact_point_index()
            cp = idnt["tip position"][cpid]
            params["contact_point"].set(cp)
        else:
            msg = "Cannot estimate contact point, because of missing "\
                  + "column 'tip position'"
            warnings.warn(msg, FitWarning)
    if model_ancillaries and idnt is not None:
        anc_dict = idnt.get_ancillary_parameters()
        # set the parameter values
        for anckey in anc_dict:
            if anckey in params:
                if not np.isnan(anc_dict[anckey]):  # ignore nans
                    params[anckey].set(value=anc_dict[anckey])
    return params


[docs]def obj2str(obj):
    """String representation of an object for hashing"""
    if isinstance(obj, str):
        return obj.encode("utf-8")
    elif isinstance(obj, (bool, int, float)):
        return str(float(obj)).encode("utf-8")
    elif obj is None:
        return b"none"
    elif isinstance(obj, np.ndarray):
        return obj.tostring()
    elif isinstance(obj, tuple):
        return obj2str(list(obj))
    elif isinstance(obj, list):
        return b"".join(obj2str(o) for o in obj)
    elif isinstance(obj, dict):
        return obj2str(list(obj.items()))
    elif isinstance(obj, lmfit.parameter.Parameter):
        return obj2str([obj.value, obj.max, obj.min, obj.vary,
                        obj.expr, obj.name])
    else:
        raise ValueError("No rule to convert object '{}' to string.".
                         format(obj.__class__))