# Source code for multidim (extracted from generated documentation)

# coding=utf-8

r"""
The `multidim` class provides user-facing tools for topological data analysis
of multi-dimensional data.

The goal is to be honest about topology while also using speed/cleverness with
a minimal amount of user headache.

Included are:
    - `PointCloud`, for data points in Euclidean space.
    - `SimplicialComplex`, for abstract simplicial complexes, built
      from `Simplex` objects sorted by dimension into `SimplexStratum` objects.


Copyright
---------
- This file is part of https://github.com/geomdata/gda-public/
- 2015, 2016, 2017 by Geometric Data Analytics, Inc. (http://geomdata.com)
- AGPL license. See `LICENSE` or https://github.com/geomdata/gda-public/blob/master/LICENSE

Examples
--------

>>> X = np.load("tests/circle.npy")
>>> pc = PointCloud(X, max_length=-1)
>>> pc
A SimplicialComplex with 1000 points, 499500 edges, and 0 faces.
>>> np.all(pc.stratum[0]['pos'].values == True)
True
>>> pc.check()
>>> pc.make_pers0(cutoff=0.15)
>>> for v in pc.cells(0):
...     if v.positive:
...         print(v)
0+ Simplex 0 of height 0.0 and mass 1.0
0+ Simplex 74 of height 0.0 and mass 1.0
0+ Simplex 183 of height 0.0 and mass 1.0
0+ Simplex 195 of height 0.0 and mass 1.0
0+ Simplex 197 of height 0.0 and mass 1.0
0+ Simplex 231 of height 0.0 and mass 1.0
0+ Simplex 354 of height 0.0 and mass 1.0
0+ Simplex 397 of height 0.0 and mass 1.0
0+ Simplex 489 of height 0.0 and mass 1.0
0+ Simplex 530 of height 0.0 and mass 1.0
0+ Simplex 607 of height 0.0 and mass 1.0
0+ Simplex 757 of height 0.0 and mass 1.0
0+ Simplex 781 of height 0.0 and mass 1.0
0+ Simplex 800 of height 0.0 and mass 1.0
0+ Simplex 903 of height 0.0 and mass 1.0
0+ Simplex 980 of height 0.0 and mass 1.0
>>> pc.pers0.grab(5)['keepcode']
     birth_index  death_index  birth     death      pers
979          213          316    0.0  0.136923  0.136923
980          135          135    0.0  0.136992  0.136992
981          439          477    0.0  0.138059  0.138059
982          610          630    0.0  0.138474  0.138474
983          603          603    0.0  0.139332  0.139332
>>> pc.make_pers1_rca1(cutoff=0.2)
>>> pc.pers1.grab(5)['keepcode']
     birth_index  death_index     birth     death      pers
221         3217         9700  0.095619  0.168120  0.072501
220         2942         9661  0.091542  0.167720  0.076177
219         2713         9279  0.087438  0.164152  0.076713
224         3333        10439  0.097564  0.174643  0.077079
200         1816         7688  0.071490  0.149336  0.077846
>>> V=pc.stratum[0]
>>> V.loc[:10]
    height  mass    pos  rep
0      0.0   1.0   True    0
1      0.0   1.0  False    0
2      0.0   1.0  False    1
3      0.0   1.0  False    0
4      0.0   1.0  False    0
5      0.0   1.0  False    4
6      0.0   1.0  False    1
7      0.0   1.0  False    0
8      0.0   1.0  False    0
9      0.0   1.0  False    0
10     0.0   1.0  False    1
>>> pc.cells(0)[0]
0+ Simplex 0 of height 0.0 and mass 1.0
>>> pc.cells(0)[2]
0- Simplex 2 of height 0.0 and mass 1.0
>>> E=pc.stratum[1]
>>> E.loc[:10]
      height    pos  rep  bdy0  bdy1
0   0.001142  False    0   858   866
1   0.001997  False    1    98   187
2   0.002471  False    2   251   313
3   0.002670  False    3   599   629
4   0.002766  False    4   150   167
5   0.003405  False    5   573   620
6   0.003812  False    6   474   517
7   0.005357  False    7   893   988
8   0.005533  False    8   623   644
9   0.005914  False    9   648   744
10  0.006056  False   10   612   640
>>> pc.cells(1)[2]
1- Simplex 2 of height 0.0024707293775457456 and mass None
"""

from __future__ import print_function

from collections import defaultdict
import itertools
import numpy as np
import pandas as pd
# Don't roll our own L2 norms
from scipy.spatial.distance import squareform, cdist, pdist, is_valid_dm
from . import fast_algorithms
import homology.dim0
import homology.dim1


class Simplex(object):
    r"""
    This class is a convenient container to access the data in the
    pd DataFrame stratum[dim] of a SimplicialComplex.
    It is always faster to access stratum[dim].loc[index] directly.

    Parameters
    ----------
    cellcomplex : :class:`SimplicialComplex`
        The SimplicialComplex to which this Simplex belongs.
    dim : int
        The dimension in which this Simplex lives.
    index : int
        The abstract index of this Simplex.

    Attributes
    ----------
    cellcomplex : `SimplicialComplex`
        The `SimplicialComplex` to which this Simplex belongs.
    dim : int
        The dimension in which this Simplex lives.
    index : int
        The abstract index of this Simplex.
    shadow_complex : `SimplicialComplex`
    children : :class:`pandas.Series`

    See Also
    --------
    SimplicialComplex : A class for abstract simplicial cell complexes.

    Notes
    -----
    A finite (*abstract*) *simplicial complex* is a finite set :math:`A`
    together with collection :math:`\Delta` of subsets of :math:`A` such that
    if :math:`X \in \Delta` and :math:`Y \subset X` then
    :math:`Y \in \Delta`. [1]_

    References
    ----------
    .. [1] D. Feichtner-Kozlov, Combinatorial algebraic topology.
        Berlin: Springer, 2008.
    """

    def __init__(self, cellcomplex, dim, index):
        self.cellcomplex = cellcomplex
        self.dim = dim
        self.index = index
        self.shadow_complex = None
        self.children = pd.Series(dtype=np.float64)

    @property
    def height(self):
        r"""
        :return: height (that is, filtered value) of this cell (np.float64)
        """
        return self.cellcomplex.stratum[self.dim].loc[self.index, 'height']

    @height.setter
    def height(self, v):
        # Use a single .loc[row, col] assignment instead of chained indexing
        # (df['height'].loc[i] = v), which pandas does not guarantee to write
        # back to the underlying frame.
        self.cellcomplex.stratum[self.dim].loc[self.index, 'height'] = v

    @property
    def mass(self):
        r"""
        :return: mass of this cell (np.float64 or None)
        """
        # Mass is optional; vertex strata of weighted complexes carry it,
        # edge strata generally do not.
        if 'mass' in self.cellcomplex.stratum[self.dim]:
            return self.cellcomplex.stratum[self.dim].loc[self.index, 'mass']
        else:
            return None

    @mass.setter
    def mass(self, v):
        self.cellcomplex.stratum[self.dim].loc[self.index, 'mass'] = v

    @property
    def positive(self):
        r"""
        :return: positivity flag of this cell (bool), as used by the
            persistence algorithms.
        """
        return self.cellcomplex.stratum[self.dim].loc[self.index, 'pos']

    @positive.setter
    def positive(self, b):
        r"""
        :param b: new positivity flag (bool)
        """
        self.cellcomplex.stratum[self.dim].loc[self.index, 'pos'] = b

    @property
    def representative(self):
        r"""
        :return: index of the representative cell of this cell's component.
        """
        return self.cellcomplex.stratum[self.dim].loc[self.index, 'rep']

    @representative.setter
    def representative(self, r):
        r"""
        :param r: new representative index
        """
        self.cellcomplex.stratum[self.dim].loc[self.index, 'rep'] = r

    @property
    def boundary(self):
        r"""
        :return: set of indices of the boundary cells (empty for vertices).
        """
        parts = []
        if self.dim > 0:
            # A dim-simplex has dim+1 boundary cells, stored as bdy0..bdyN.
            parts = range(self.dim + 1)
        return {self.cellcomplex.stratum[self.dim].loc[self.index, 'bdy{}'.format(j)]
                for j in parts}

    @boundary.setter
    def boundary(self, s):
        r"""
        :param s: iterable of boundary cell indices; stored sorted into
            the bdy0..bdyN columns.
        """
        for i, c in enumerate(sorted(list(s))):
            self.cellcomplex.stratum[self.dim].loc[self.index, 'bdy{}'.format(i)] = c

    def __hash__(self):
        r"""
        :return: the abstract index (int). Consistent with __eq__, since
            equal simplices share an index.
        """
        return self.index

    def __eq__(self, other):
        r"""
        :param other: another Simplex
        :return: True if both live in the same complex, same dimension,
            and have the same index.

        BUG FIX: the original compared only complex and index, so a vertex
        and an edge with the same index compared equal.
        """
        return (self.cellcomplex == other.cellcomplex
                and self.dim == other.dim
                and self.index == other.index)

    def __repr__(self):
        r"""
        :return: short human-readable description, e.g.
            "0+ Simplex 3 of height 0.0 and mass 1.0"
        """
        sign = "+"
        if not self.positive:
            sign = "-"
        return "{}{} Simplex {} of height {} and mass {}".format(self.dim, sign,
                                                                 self.index,
                                                                 repr(self.height),
                                                                 repr(self.mass))

    def __lt__(self, other):
        # Order simplices of the same complex and dimension by height.
        if not (self.cellcomplex == other.cellcomplex):
            raise ValueError("These Cells are not in the same SimplicialComplex!")
        if not (self.dim == other.dim):
            raise ValueError("These Cells are not of the same dimension!")
        # BUG FIX: height is a property, not a method; the original
        # `self.height() < other.height()` raised TypeError.
        return self.height < other.height
class SimplexStratum(object):
    r"""
    SimplexStratum is a thin class for calling :class:`Simplex` objects of a
    certain dimension from a `SimplicialComplex`. It is an interface to the
    data in `SimplicialComplex.stratum`[dim], which is a `pandas.DataFrame`.
    Whenever possible, the `pandas.DataFrame` should be called directly, for
    speed.
    """

    def __init__(self, cell_complex, dim):
        self.cell_complex = cell_complex
        self.dim = dim
        # Lazy cache mapping abstract index -> Simplex object.
        self._cells = dict()

    def __getitem__(self, i):
        # Simplex objects are created on demand and memoized.
        if i not in self._cells:
            self._cells[i] = Simplex(self.cell_complex, self.dim, i)
        return self._cells[i]

    def __iter__(self):
        # Iterate Simplex objects in the order of the underlying DataFrame.
        for i in self.cell_complex.stratum[self.dim].index:
            yield self[i]

    def __repr__(self):
        return "Stratum {} of SimplicialComplex {}".format(self.dim,
                                                           id(self.cell_complex))

    def __hash__(self):
        # BUG FIX: __hash__ must return an int; the original returned a
        # tuple `id(self.cell_complex), self.dim`, so hash() raised TypeError.
        return hash((id(self.cell_complex), self.dim))

    def __eq__(self, other):
        # BUG FIX: the original evaluated `type(self) == other(self)` (which
        # *calls* other) and compared the bound methods
        # `self.__hash__ == other.__hash__` instead of their values.
        return type(self) == type(other) and self.__hash__() == other.__hash__()
def stratum_maker(dim=0):
    r"""
    Make an empty stratum :class:`pandas.DataFrame` of the appropriate
    dimension.

    This is used to initialize a new dimension of a :class:`SimplicialComplex`.

    Parameters
    ----------
    dim : int
        Dimension of stratum (0, 1, 2, ...)

    Returns
    -------
    DataFrame : :class:`pandas.DataFrame`
        pd DataFrame suitable for SimplicialComplex.stratum[dim]

    See Also
    --------
    :class:`SimplicialComplex` : A class for abstract simplicial cell complexes.
    """
    # A dim-simplex has dim+1 boundary cells (columns bdy0..bdy{dim});
    # vertices (dim == 0) carry no boundary columns.
    bdy_size = 0
    if dim > 0:
        bdy_size = dim + 1
    return pd.DataFrame({},
                        columns=['height', 'pos', 'rep'] + ['bdy{}'.format(i) for i in range(bdy_size)],
                        index=range(0))


def stratum_from_distances(dists, max_length=-1.0, points=None):
    r"""
    Construct a stratum dictionary from a symmetric matrix of distances.

    Parameters
    ----------
    dists : :class:`numpy.ndarray`
        A symmetric NxN array for distances, as obtained from
        :class:`scipy.spatial.distances.squareform`
    max_length : float
        If max_length >= 0, store only those edges of length < max_length.
        Default: -1.0, store all edges.
    points : :class:`pandas.DataFrame`
        A fully-formed DataFrame of point information for stratum[0].
        If None, a default vertex stratum (height 0, positive, identity
        representative) is built from the size of ``dists``.

    Returns
    -------
    {0: points, 1: edges} : dict
        A stratum dictionary, suitable for SimplicialComplex objects.

    See Also
    --------
    :func:`fast_algorithms.edges_from_dists`
    """
    # Validate that dists really is a square, symmetric, zero-diagonal
    # distance matrix; raises on failure.
    is_valid_dm(dists, throw=True)
    if points is None:
        n = dists.shape[0]
        idx0 = np.arange(n, dtype=np.int64)
        hgt0 = np.zeros(n, dtype=np.float64)
        pos0 = np.ones(shape=(n,), dtype='bool')
        points = pd.DataFrame({
            'height': hgt0,
            'pos': pos0,
            'rep': idx0,
        },
            columns=['height', 'pos', 'rep'],
            index=idx0)

    if max_length == 0:
        # if the cutoff is 0, we don't want to bother to make all the
        # distances.
        edges = stratum_maker(1)
    else:
        hgt1, pos1, bdys = fast_algorithms.edges_from_dists(points.index.values,
                                                            dists,
                                                            cutoff=np.float64(max_length))
        num_edges = hgt1.shape[0]
        idx1 = np.arange(num_edges, dtype='int64')
        edges = pd.DataFrame({
            'height': hgt1,
            'pos': pos1,
            'rep': idx1,
            'bdy0': bdys[:, 0],
            'bdy1': bdys[:, 1],
        },
            columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
            index=idx1)

    return {0: points, 1: edges}


def lower_star_for_image(img_array):
    """
    Compute the lower star weighted simplicial complex from a 2d grid/image.

    Each pixel becomes a vertex; axis-aligned neighbors are joined by edges,
    and each grid square is split by one diagonal into two triangles.  Every
    cell's height is the max of its vertices' pixel values (lower-star rule).

    Parameters
    ----------
    img_array, a `numpy.ndarray` of dimension 2.

    Returns
    -------
    `homology.SimplicialComplex`

    Examples
    --------

    >>> A = np.random.rand(3,4)
    >>> lower_star_for_image(A)
    A SimplicialComplex with 12 points, 23 edges, and 12 faces.
    """
    assert len(img_array.shape) == 2,\
        "Lower-star filtration is currently for images (2d arrays) only."

    m = img_array.shape[0]
    n = img_array.shape[1]

    # make all vertices, by flattening val_array and indexing in the normal way
    verts_hgt = img_array.flatten()
    verts_rep = np.arange(m*n)
    # flat_index[i, j] recovers the flattened vertex index of pixel (i, j).
    flat_index = verts_rep.reshape(m, n)

    edges_hgt = []
    edges_rep = []
    edges_bdy0 = []
    edges_bdy1 = []

    # Make all the horizontal edges.
    for i, j in itertools.product(range(m), range(n-1)):
        # collect vertices' indices and heights
        # left=(i,j) -- right=(i,j+1)
        lf_idx = flat_index[i, j]
        rt_idx = flat_index[i, j+1]
        # There is no real reason for these asserts -- just clarification.
        # assert lf_idx == n*(i) + (j)
        # assert rt_idx == n*(i) + (j+1)
        lf_hgt = img_array[i, j]
        rt_hgt = img_array[i, j+1]
        # There is no real reason for these asserts -- just clarification.
        # assert lf_hgt == verts_hgt[lf_idx]
        # assert rt_hgt == verts_hgt[rt_idx]
        edges_hgt.append(np.max([lf_hgt, rt_hgt]))
        edges_rep.append(len(edges_rep))
        edges_bdy0.append(lf_idx)
        edges_bdy1.append(rt_idx)
        # This i,j horizontal edge should have index (n-1)*i + j
        assert len(edges_hgt) - 1 == (n-1)*i + j

    # did we count all horizontal edges?
    assert len(edges_hgt) == (n-1)*m

    # Make all the vertical edges
    for i, j in itertools.product(range(m-1), range(n)):
        # collect vertices' indices and heights
        # top=(i,j)
        #  |
        # bot=(i+1,j)
        tp_idx = flat_index[i, j]
        bt_idx = flat_index[i+1, j]
        # There is no real reason for these asserts -- just clarification.
        # assert tp_idx == n*(i) + (j)
        # assert bt_idx == n*(i+1) + (j)
        tp_hgt = img_array[i, j]
        bt_hgt = img_array[i+1, j]
        # There is no real reason for these asserts -- just clarification.
        # assert tp_hgt == verts_hgt[tp_idx]
        # assert bt_hgt == verts_hgt[bt_idx]
        edges_hgt.append(np.max([tp_hgt, bt_hgt]))
        edges_rep.append(len(edges_rep))
        edges_bdy0.append(tp_idx)
        edges_bdy1.append(bt_idx)
        # This i,j vertical edge should have index n*i + j
        # AFTER the (n-1)*m horizontal edges
        assert len(edges_hgt) - 1 == (n-1)*m + n*i + j

    # did we count all vertical AND horizontal edges?
    assert len(edges_hgt) == (n-1)*m + n*(m-1)

    faces_hgt = []
    faces_rep = []
    faces_bdy0 = []
    faces_bdy1 = []
    faces_bdy2 = []

    # Make the diagonal edges, and the faces, too.
    # The diagonal of each grid square is chosen to pass through the pixel
    # of maximal height, so that both triangles respect the filtration.
    for i, j in itertools.product(range(m-1), range(n-1)):
        # collect the vertices' indices and heights
        # nw=(i,j)    ne=(i, j+1)     # at (i,j)
        # sw=(i+1, j) se=(i+1, j+1)
        nw_idx = flat_index[i, j]
        ne_idx = flat_index[i, j+1]
        se_idx = flat_index[i+1, j+1]
        sw_idx = flat_index[i+1, j]
        # There is no real reason for these asserts -- just clarification.
        # assert nw_idx == n*(i) + (j)
        # assert ne_idx == n*(i) + (j+1)
        # assert se_idx == n*(i+1) + (j+1)
        # assert sw_idx == n*(i+1) + (j)
        nw_hgt = img_array[i, j]
        ne_hgt = img_array[i, j+1]
        se_hgt = img_array[i+1, j+1]
        sw_hgt = img_array[i+1, j]
        # There is no real reason for these asserts -- just clarification.
        # assert nw_hgt == verts_hgt[nw_idx]
        # assert ne_hgt == verts_hgt[ne_idx]
        # assert se_hgt == verts_hgt[se_idx]
        # assert sw_hgt == verts_hgt[sw_idx]

        # determine diagonal
        cell_max_loc = np.argmax([nw_hgt, ne_hgt, se_hgt, sw_hgt])
        if cell_max_loc % 2 == 0:
            # Max is either nw or se.
            # Make edge (nw,se)
            edges_hgt.append(np.max([nw_hgt, se_hgt]))
            edges_rep.append(len(edges_rep))
            edges_bdy0.append(nw_idx)
            edges_bdy1.append(se_idx)

            # Make face (nw,ne,se).
            faces_hgt.append(np.max([nw_hgt, ne_hgt, se_hgt]))
            faces_rep.append(len(faces_rep))
            faces_bdy0.append((n-1)*i + j)  # horizontal nw-ne
            # assert edges_bdy0[ (n-1)*i + j ] == nw_idx
            # assert edges_bdy1[ (n-1)*i + j ] == ne_idx
            faces_bdy1.append((n-1)*m + n*i + j+1)  # vertical ne|se
            # assert edges_bdy0[ (n-1)*m + n*i + j+1 ] == ne_idx
            # assert edges_bdy1[ (n-1)*m + n*i + j+1 ] == se_idx
            faces_bdy2.append(edges_rep[-1])  # most recent edge is nw\se
            # assert edges_bdy0[ edges_rep[-1] ] == nw_idx
            # assert edges_bdy1[ edges_rep[-1] ] == se_idx

            # Make face (sw,se,nw).
            faces_hgt.append(np.max([sw_hgt, se_hgt, nw_hgt]))
            faces_rep.append(len(faces_rep))
            faces_bdy0.append((n-1)*(i+1) + j)  # horizontal sw-se
            # assert edges_bdy0[ (n-1)*(i+1) + j ] == sw_idx
            # assert edges_bdy1[ (n-1)*(i+1) + j ] == se_idx
            faces_bdy1.append((n-1)*m + n*i + j)  # vertical nw|sw
            # assert edges_bdy0[ (n-1)*m + n*i + j ] == nw_idx
            # assert edges_bdy1[ (n-1)*m + n*i + j ] == sw_idx
            faces_bdy2.append(edges_rep[-1])  # most recent edge is nw\se
            # assert edges_bdy0[ edges_rep[-1] ] == nw_idx
            # assert edges_bdy1[ edges_rep[-1] ] == se_idx
        else:
            # Max is either ne or sw.
            # Make edge (ne,sw)
            edges_hgt.append(np.max([ne_hgt, sw_hgt]))
            edges_rep.append(len(edges_rep))
            edges_bdy0.append(ne_idx)
            edges_bdy1.append(sw_idx)

            # Make face (nw,ne,sw).
            faces_hgt.append(np.max([nw_hgt, ne_hgt, sw_hgt]))
            faces_rep.append(len(faces_rep))
            faces_bdy0.append((n-1)*i + j)  # horizontal nw-ne
            # assert edges_bdy0[ (n-1)*i + j ] == nw_idx
            # assert edges_bdy1[ (n-1)*i + j ] == ne_idx
            faces_bdy1.append((n-1)*m + n*i + j)  # vertical nw|sw
            # assert edges_bdy0[ (n-1)*m + n*i + j ] == nw_idx
            # assert edges_bdy1[ (n-1)*m + n*i + j ] == sw_idx
            faces_bdy2.append(edges_rep[-1])  # most recent edge is ne\sw
            # assert edges_bdy0[ edges_rep[-1] ] == ne_idx
            # assert edges_bdy1[ edges_rep[-1] ] == sw_idx

            # Make face (sw,se,ne).
            faces_hgt.append(np.max([sw_hgt, se_hgt, ne_hgt]))
            faces_rep.append(len(faces_rep))
            faces_bdy0.append((n-1)*(i+1) + j)  # horizontal sw-se
            # assert edges_bdy0[ (n-1)*(i+1) + j ] == sw_idx
            # assert edges_bdy1[ (n-1)*(i+1) + j ] == se_idx
            faces_bdy1.append((n-1)*m + n*i + j+1)  # vertical ne|se
            # assert edges_bdy0[ (n-1)*m + n*i + j+1 ] == ne_idx
            # assert edges_bdy1[ (n-1)*m + n*i + j+1 ] == se_idx
            faces_bdy2.append(edges_rep[-1])  # most recent edge is ne\sw
            # assert edges_bdy0[ edges_rep[-1] ] == ne_idx
            # assert edges_bdy1[ edges_rep[-1] ] == sw_idx

    # All cells start positive with identity representatives; the
    # persistence algorithms update these in place later.
    verts_pos = np.ones_like(verts_hgt, dtype='bool')
    edges_pos = np.ones_like(edges_rep, dtype='bool')
    faces_pos = np.ones_like(faces_rep, dtype='bool')
    verts = pd.DataFrame({'height': verts_hgt,
                          'rep': verts_rep,
                          'pos': verts_pos},
                         columns=['height', 'pos', 'rep'])
    edges = pd.DataFrame({'height': edges_hgt,
                          'rep': edges_rep,
                          'pos': edges_pos,
                          'bdy0': edges_bdy0,
                          'bdy1': edges_bdy1},
                         columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'])
    faces = pd.DataFrame({'height': faces_hgt,
                          'rep': faces_rep,
                          'pos': faces_pos,
                          'bdy0': faces_bdy0,
                          'bdy1': faces_bdy1,
                          'bdy2': faces_bdy2},
                         columns=['height', 'pos', 'rep', 'bdy0', 'bdy1', 'bdy2'])
    return SimplicialComplex(stratum={0: verts, 1: edges, 2: faces})
class SimplicialComplex(object):
    r"""
    A class for abstract *weighted* simplicial complexes.

    A SimplicialComplex is built from 0-cells (vertices), 1-cells (edges),
    2-cells (faces), and so on.

    Each cell knows its boundary.  A 0-cell has no boundary.  A 1-cell has
    two 0-cells as its boundary.  A 2-cell has three 1-cells as its boundary,
    and so on.

    Each cell *must* have a height value, called `height`.  These heights are
    used in several topological algorithms that depend on filtering.

    Each cell *may* have a mass value, called `mass`.  These masses are used
    in some data-analysis methods that involve weighted averaging or
    probability.

    A SimplicialComplex has no notion of coordinates or embedding.  For that,
    use the :class:`PointCloud` class, which inherits all methods from
    SimplicialComplex but also adds coordinate-dependent methods.

    Parameters
    ----------
    stratum : dict
        Dictionary of :class:`pandas.DataFrame` objects holding vertices,
        edges, faces, and so on.  See examples below.

    Notes
    -----
    One can reference the :class:`Simplex` objects that are separated by
    dimension into :class:`SimplexStratum` objects.

    Each :class:`Simplex` object has a height, so these are *filtered*
    simplicial complexes.

    Each :class:`Simplex` object may have a mass, so these can be *weighted*
    simplicial complexes.

    The **rep** and **pos** attributes are used when computing various
    homologies, such as the Rips or Cech complexes.

    Whenever possible, computations are done on :class:`numpy.ndarray` arrays
    in compiled code, so they are usually quite fast.

    Examples
    --------
    For example, a formal triangle could be built this way:

    >>> vertices = pd.DataFrame({'height':[ 0.0, 0.0, 0.0],
    ...                         'mass': [1.0, 1.0, 1.0],
    ...                         'pos': [True, True, True],
    ...                         'rep' : [0, 1, 2]})
    >>> edges = pd.DataFrame({'height':[ 0.0, 0.0, 0.0],
    ...                      'pos': [True, True, True],
    ...                      'rep' : [0, 1, 2],
    ...                      'bdy0': [0, 1, 2],
    ...                      'bdy1': [1, 2, 0]})
    >>> faces = pd.DataFrame({'height': [0.0],
    ...                      'pos': [True],
    ...                      'rep': [0],
    ...                      'bdy0': [0],
    ...                      'bdy1': [1],
    ...                      'bdy2': [2]})
    >>> T = SimplicialComplex(stratum = {0: vertices, 1: edges, 2: faces})
    >>> print(T)
    A SimplicialComplex with 3 points, 3 edges, and 1 faces.
    """

    def __init__(self, stratum=None):
        # stratum maps dimension -> DataFrame; missing dimensions are
        # created empty on demand via the defaultdict factory.
        if stratum is not None:
            assert all(type(k) == int and k >= 0 for k in stratum.keys()), \
                "The strata of a SimplicialComplex must be indexed by non-negative integers."
            self.stratum = defaultdict(stratum_maker, stratum)
        else:
            self.stratum = defaultdict(stratum_maker)
        # Caches: nearest-neighbor results and SimplexStratum views.
        self._nn = dict()
        self._cellstratum = dict()
        # Persistence diagrams; populated by make_pers0 / make_pers1_rca1.
        self.pers0 = None
        self.pers1 = None
[docs] @classmethod def from_distances(cls, dists, max_length=-1.0, points=None): r""" Construct a `SimplicialComplex` from a symmetric matrix of distances. Parameters ---------- dists : `numpy.ndarray` An N-by-N symmetric array, with 0s on the diagonal, as obtained from :func:`scipy.spatial.distances.squareform` max_length : float If :code:`max_length >= 0`, store only those edges of length less than :code:`max_length`. Default: -1, store all edges. points : `pandas.DataFrame` A fully-formed DataFrame of point information for stratum[0]. But, if you have that info, you probably want to use `PointCloud` instead. Returns ------- `SimplicialComplex` """ stratum = stratum_from_distances(dists, max_length, points) return cls(stratum=stratum)
[docs] def check(self): r"""Run consistency checks on all simplices in all dimensions. raises ValueError if anything is wrong. """ for dim in self.stratum.keys(): if dim > 0: valcheck = fast_algorithms.check_heights(self, dim) if not valcheck == -1: raise ValueError("Not a filtration! Check 'height' in ({}).stratum[{}].iloc[{}]".format(self, dim, valcheck))
def __repr__(self): return "A SimplicialComplex with {} points, {} edges, and {} faces.".format( len(self.stratum[0]), len(self.stratum[1]), len(self.stratum[2]))
[docs] def cells(self, dim): r""" iterate over all :class:`Simplex` objects of dimension dim. This is generated on-demand from the :class:`SimplexStratum`. """ if dim not in self._cellstratum: self._cellstratum[dim] = SimplexStratum(self, dim) return self._cellstratum[dim]
[docs] def reset(self): """ delete persistence diagrams, and forget all representative and positivity information. Use this before re-running :func:`make_pers0` or other methods in `homology` with a smaller cutoff. """ self.pers0 = None self.pers1 = None for dim in self.stratum.keys(): if self.stratum[dim] is None: self.stratum[dim] = stratum_maker(dim) self.stratum[dim]['rep'].values[:] = self.stratum[ dim].index.values # identity representation self.stratum[dim]['pos'].values[:] = True pass
[docs] def make_pers0(self, cutoff=-1.0): r"""Run the UnionFind algorithm to mark connected components of the SimplicialComplex. This marks points as positive/negative. It also marks the reprensetatives of points. It makes a PersDiag object with unionfind, saved as :code:`self.pers0` """ if (1 not in self.stratum.keys()) or len(self.stratum[1]) == 0: raise ValueError("This SimplicialComplex has no 1-stratum (edges). Persistence is meaningless.") try: if self.max_length >= 0.0 and cutoff > self.max_length: raise ValueError("Persistence cutoff is greater than max_length of pre-computed edges. This is meaningless.") except AttributeError: pass tbirth_index, tdeath_index, ybirth_index, ydeath_index, mergetree = homology.dim0.unionfind(self, np.float64(cutoff)) self.pers0 = homology.PersDiag(tbirth_index, tdeath_index, ybirth_index, ydeath_index, mergetree) pass
def sever(self):
    r"""Split this complex into its persistently connected pieces.

    Two points land in the same piece exactly when they are joined by a
    chain of edges shorter than the cutoff used in the most recent
    0-dimensional persistence computation, so run ``self.reset()`` and
    ``self.make_pers0(cutoff)`` first.

    Yields
    ------
    (indices, subpointcloud)
        For each persistently connected component: the indices of its
        points in the original `PointCloud`, and a new `PointCloud`
        (with its own sequential index) holding those points.

    See Also
    --------
    :func:`make_pers0`
    :func:`reset`
    """
    from homology.dim0 import all_roots
    reps = self.stratum[0]['rep'].values.copy()
    # Path-compress so every point maps directly to its component root.
    all_roots(reps)
    for root in np.where(self.stratum[0]['pos'].values == True)[0]:
        members = reps == root
        yield np.where(members)[0], PointCloud(self.coords.values[members, :])
def make_pers1_rca1(self, cutoff=-1.0):
    r"""Run RCA1 and make a 1-dimensional `homology.PersDiag` for the
    edge-pairings for cycle generators.

    This reruns self.make_pers0(cutoff) again, to make sure components
    are marked correctly.

    Parameters
    -----------
    cutoff: float
        Maximum edge height to use for RCA1 algorithm.
        Higher edges ignored. (Default: -1, meaning use all edges.)

    Returns
    -------
    None.  Produces `self.pers1`, a table of edge pairs, similar to a
    persistence diagram.

    BUGS
    ----
    data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,0.5]]) fails.

    Examples
    --------

    >>> data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,1.]])
    >>> pc = PointCloud(data, max_length=-1)
    >>> print(pc.stratum[1])
         height   pos  rep  bdy0  bdy1
    0  1.000000  True    0     0     1
    1  1.000000  True    1     0     2
    2  1.000000  True    2     1     3
    3  1.000000  True    3     2     3
    4  1.414214  True    4     0     3
    5  1.414214  True    5     1     2
    >>> pc.make_pers1_rca1()
    >>> print(pc.pers1.diagram)
       birth_index  death_index  birth     death      pers
    0            3            4    1.0  1.414214  0.414214

    >>> data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,0.5]])
    >>> pc = PointCloud(data, max_length=-1)
    >>> print(pc.stratum[1])
         height   pos  rep  bdy0  bdy1
    0  0.500000  True    0     1     3
    1  1.000000  True    1     0     1
    2  1.000000  True    2     0     2
    3  1.118034  True    3     0     3
    4  1.118034  True    4     2     3
    5  1.414214  True    5     1     2
    >>> pc.make_pers1_rca1()
    >>> print(pc.pers1.diagram)
    Empty DataFrame
    Columns: [birth_index, death_index, birth, death, pers]
    Index: []
    """
    # we need 0dim persistence first.
    self.reset()
    self.make_pers0(cutoff=cutoff)
    # Reduce the boundary matrix over the edges below the cutoff.
    column_list, column_edge_index, stop_edge = homology.dim1.rca1(self.stratum[1], cutoff=cutoff)
    assert len(column_list) == len(column_edge_index)
    # Each non-empty reduced column pairs c[-1] (presumably the low/birth
    # edge of the column -- verify against homology.dim1.rca1) with the
    # edge that created the column (the death edge).
    pers_list = [(c[-1], column_edge_index[i]) for i, c in enumerate(column_list) if c]
    p = np.array(pers_list)
    if len(p)>0:
        mergetree = dict([])  # we can't compute a mergetree yet
        # Birth/death heights are looked up from the edge stratum by index.
        self.pers1 = homology.PersDiag(
            p[:, 0],
            p[:, 1],
            self.stratum[1]['height'].loc[p[:, 0]].values,
            self.stratum[1]['height'].loc[p[:, 1]].values,
            mergetree)
    else:
        # no births or deaths recorded
        self.pers1 = homology.PersDiag([], [], [], [], dict([]))
    pass
class PointCloud(SimplicialComplex):
    r"""
    PointCloud is a class for *embedded*, weighted simplicial complexes.
    This is a subclass of :class:`SimplicialComplex`, with the additional
    property that every 0-cell (vertex) is actually a point in
    :math:`\mathbb{R}^k.`

    The most basic and most common example of a PointCloud is an indexed set
    of :math:`N` points in :math:`\mathbb{R}^k` with heights assigned as 0,
    and mass assigned as 1.

    Typically, a user starts with 0-cells only.  Then, any 1-cells, 2-cells,
    ..., are created later via some topological construction.
    """

    def __init__(self, data_array, max_length=0.0, heights=None, masses=None,
                 dist='euclidean', idx0=None, cache_type=None):
        r"""Construct a :class:`PointCloud` from a cloud of n points in
        :math:`\mathbb{R}^k.`

        Parameters
        ----------
        data_array : :class:`numpy.ndarray`
            A np array with shape=(n,k), to use as the pointcloud.  The
            array must have dtype=='float64'.
        max_length : float
            If max_length is positive, then find edges of length <=
            max_length.  This uses the :class:`multidim.covertree.CoverTree`
            for efficiency.  Default is 0.0, meaning compute no edges.
        heights : :class:`numpy.ndarray`
            If heights is given, it is used to assign graded values to the
            points in data_array.  It must be a np array of dtype=='float64'
            and shape==(n,), like from
            np.apply_along_axis(my_func, 1, data_array).
            Default: None (all have height 0.0)
        masses : :class:`numpy.ndarray`
            If masses is given, it is used to assign mass values to the
            points in data_array.  It must be a np array of dtype=='float64'
            and shape==(n,), like from
            np.apply_along_axis(my_func, 1, data_array).
            Default: None (all have mass 1.0)
        dist : function
            If dist is given, it is used as the distance function for
            computing edge lengths, via scipy.spatial.distance.pdist.
            Not used with on-demand caching.
            Default: 'euclidean'
        idx0 : :class:`numpy.ndarray`
            If idx0 is given, it is used to assign index values to the
            points in data_array.  It must be a np array of dtype=='int64'
            and shape==(n,).
            Default: None (index by order given in data_array)
        cache_type : None or "np" or "dict"
            What type of distance cache to use.  Often None is actually
            faster!  If you really care about speed, remember to use -O.
        """
        assert data_array.dtype == np.float64, "Data must be float64."
        n, k = data_array.shape
        self.dimension = k

        self.cache_type = cache_type
        self.dist_cache = None
        self.dist = dist
        if self.cache_type is None:
            self.dist_cache = None
        elif self.cache_type == "np":
            # -1.0 marks "not yet computed"; diagonal self-distances are 0.
            self.dist_cache = np.eye(n, dtype=np.float64) - 1.0
        elif self.cache_type == "dict":
            self.dist_cache = dict(((i, i), np.float64(0.0)) for i in range(n))
        else:
            raise ValueError("cache_type can be None or 'dict' or 'np'")

        if heights is None:
            heights = np.zeros(n, dtype=np.float64)
        else:
            assert type(heights) == np.ndarray \
                and heights.shape == (n,) \
                and heights.dtype == 'float64', \
                "Wrong type or size for heights data on pointcloud."
        hgt0 = heights

        if masses is None:
            masses = np.ones(n, dtype=np.float64)
        else:
            # BUGFIX: this message previously said "heights" (copy-paste),
            # which misreported which argument failed validation.
            assert type(masses) == np.ndarray \
                and masses.shape == (n,) \
                and masses.dtype == 'float64', \
                "Wrong type or size for masses data on pointcloud."
        mas0 = masses

        pos0 = np.ones(shape=(n,), dtype='bool')

        if idx0 is None:
            idx0 = np.arange(n, dtype='int64')
        else:
            assert type(idx0) == np.ndarray \
                and idx0.shape == (n,) \
                and idx0.dtype == 'int64', \
                "Wrong type or size for indexing data on pointcloud."

        points = pd.DataFrame({
            'height': hgt0,
            'mass': mas0,
            'pos': pos0,
            'rep': idx0,
        }, columns=['height', 'mass', 'pos', 'rep'], index=idx0)

        self.coords = pd.DataFrame(data_array, index=idx0)
        self.covertree = None
        edges = stratum_maker(1)
        # BUGFIX: super(self.__class__, self) recurses infinitely if this
        # class is ever subclassed; name the class explicitly instead.
        super(PointCloud, self).__init__(stratum={0: points, 1: edges})

        self.labels = np.zeros(shape=(self.coords.shape[0],), dtype=np.int64)
        self.source = np.zeros(shape=(self.coords.shape[0],), dtype=np.int64)
        self.label_info = pd.DataFrame(index=['black'])
        self.label_info['clouds'] = np.array([1], dtype=np.int64)
        self.label_info['points'] = np.array([n], dtype=np.int64)
        self.label_info['tot_mass'] = np.array([self.stratum[0]['mass'].sum()])
        self.label_info['int_index'] = np.array([0], dtype=np.int64)

        self.max_length = max_length
        if self.max_length > 0.0 or self.max_length == -1.0:
            # use covertree to make all appropriate edges
            # (max_length == -1.0 means: compute ALL edges).
            from . import covertree
            self.covertree = covertree.CoverTree(self)
            bdy0 = []
            bdy1 = []
            hgts = []
            for i, j, d in self.covertree.make_edges(max_distance=self.max_length):
                # Store each edge with its endpoints in canonical order.
                bdy0.append(min(i, j))
                bdy1.append(max(i, j))
                hgts.append(d)
            bdy0 = np.array(bdy0, dtype=np.int64)
            bdy1 = np.array(bdy1, dtype=np.int64)
            hgts = np.array(hgts)
            # Sort edges by height to form a filtration.
            sortby = hgts.argsort()
            bdy0 = bdy0[sortby]
            bdy1 = bdy1[sortby]
            hgts = hgts[sortby]
            edges = pd.DataFrame({'height': hgts,
                                  'pos': np.ones(shape=hgts.shape, dtype='bool'),
                                  'rep': np.arange(hgts.shape[0], dtype=np.int64),
                                  'bdy0': bdy0,
                                  'bdy1': bdy1,
                                  },
                                 columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
                                 index=np.arange(hgts.shape[0], dtype=np.int64))
            self.stratum[1] = edges
@classmethod
def from_distances(cls, *args, **kwargs):
    r"""Unavailable for `PointCloud`: actual coordinates are required.
    Perhaps you want :func:`SimplicialComplex.from_distances` instead?

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("This method does not inherit to PointCloud. Use the version from the parent class, SimplicialComplex.")
def plot(self, canvas, cutoff=-1, color='purple', pos_edges=False,
         edge_alpha=-1.0, size=1, twocells=False,
         title="SimplicialComplex", label=False):
    r"""Plot a PointCloud, decorated by various properties.

    Often slow!

    Parameters
    ----------
    canvas : object
        An instance of `bokeh.plotting.figure.Figure` as in
        :code:`canvas = bokeh.plotting.figure()`
        or an instance of :class:`matplotlib.axes._subplots.AxesSubplot`
        as in :code:`axes,canvas = matplotlib.pyplot.subplots()`
    cutoff : float
        if cutoff>=0, only draw edges up to length <cutoff
    color : str
        currently unused; point colors come from `self.label_info`
        (kept for interface compatibility)
    pos_edges : bool
        also draw the positive edges (in orange)?
    edge_alpha : float
        fixed opacity for pyplot edges; if negative, opacity scales
        with edge height
    size : int
        marker size for points
    twocells : boolean
        draw 2-cells (triangles)?  (not implemented)
    title : string
        title for plot
    label : boolean
        label points in plot?  (bokeh only)
    """
    if type(canvas).__module__ == 'bokeh.plotting.figure':
        from bokeh.models import Range1d
        canvas_type = "bokeh"
        import bokeh.plotting
    elif type(canvas).__module__ == 'matplotlib.axes._subplots':
        canvas_type = "pyplot"
    else:
        raise NotImplementedError(
            "canvas must be a bokeh.plotting.figure() or a matplotlib.pyplot.subplots()[1]. You gave me {}".format(
                type(canvas))
        )

    n, k = self.coords.shape
    assert k == 2, "I can only plot in R^2. Maybe project your data first?"

    if canvas_type == "bokeh":
        canvas.title.text = title
    elif canvas_type == "pyplot":
        canvas.set_title(title)

    if twocells:
        raise NotImplementedError(
            "Have not re-incomporated 2-cells into RCA1 yet.")

    # find edges
    all_edges = self.stratum[1]
    if cutoff >= 0:
        all_edges = all_edges[all_edges['height'] < cutoff]

    minhgt = maxhgt = 0.0
    if len(all_edges) > 0:
        minhgt = np.min(all_edges['height'].values)
        maxhgt = np.max(all_edges['height'].values)
        # BUGFIX: a degenerate height range would divide by zero in the
        # alpha scaling below; fall back to full opacity.
        if maxhgt == minhgt:
            edge_alpha = 1.0
    else:
        edge_alpha = 1.0

    def _edge_opacity(height):
        # BUGFIX: this computation previously referenced undefined names
        # `minval`/`maxval` (a NameError at runtime when edge_alpha < 0);
        # the computed range is minhgt/maxhgt.
        if edge_alpha >= 0.0:
            return edge_alpha
        return 0.5 + 0.5 * (height - minhgt) / (maxhgt - minhgt)

    # plot positive edges, need to build structure for multi_line
    if pos_edges:
        pos = all_edges[all_edges['pos'] == True]
        val = pos['height'].values
        pt0 = self.coords.loc[pos['bdy0'].values].values
        pt1 = self.coords.loc[pos['bdy1'].values].values
        pts = np.hstack([pt0, pt1])
        xs = pts[:, 0::2]
        ys = pts[:, 1::2]
        if canvas_type == "bokeh":
            canvas.multi_line(list(xs), list(ys), line_width=1,
                              alpha=0.4, color='orange')
        elif canvas_type == "pyplot":
            # should use Collections instead.
            for i in range(xs.shape[0]):
                canvas.plot(xs[i, :], ys[i, :],
                            alpha=_edge_opacity(val[i]), color='orange')

    # plot negative edges, need to build structure for multi_line
    neg = all_edges[all_edges['pos'] == False]
    val = neg['height'].values
    pt0 = self.coords.loc[neg['bdy0'].values].values
    pt1 = self.coords.loc[neg['bdy1'].values].values
    pts = np.hstack([pt0, pt1])
    xs = pts[:, 0::2]
    ys = pts[:, 1::2]
    if canvas_type == "bokeh":
        canvas.multi_line(list(xs), list(ys), line_width=1,
                          alpha=0.6, color='blue')
    elif canvas_type == "pyplot":
        # should use Collections instead.
        for i in range(xs.shape[0]):
            canvas.plot(xs[i, :], ys[i, :],
                        alpha=_edge_opacity(val[i]), color='blue')

    all_verts = self.stratum[0]
    # CURRENT UNIONFIND DOES NOT MARK NEG VERTS
    neg = all_verts[all_verts['pos'] == False]
    xs = list(self.coords.loc[neg.index, 0])
    ys = list(self.coords.loc[neg.index, 1])
    if canvas_type == "bokeh":
        canvas.circle(xs, ys, color='black', alpha=0.5, size=size)
    elif canvas_type == "pyplot":
        canvas.scatter(x=xs, y=ys, s=size, color='black', alpha=0.5)

    pos = all_verts[all_verts['pos'] == True]
    xs = self.coords.loc[pos.index, 0]
    ys = self.coords.loc[pos.index, 1]
    cs = list(self.label_info.index[self.labels[pos.index]])
    if canvas_type == "bokeh":
        # fix the aspect ratio!
        xmid = (xs.max() + xs.min()) / 2.0
        ymid = (ys.max() + ys.min()) / 2.0
        span = max([xs.max() - xmid, xmid - xs.min(),
                    ys.max() - ymid, ymid - ys.min()])
        canvas.x_range = Range1d(xmid - span, xmid + span)
        canvas.y_range = Range1d(ymid - span, ymid + span)
        canvas.circle(list(xs), list(ys), color=cs, alpha=0.4, size=size)
    elif canvas_type == "pyplot":
        canvas.scatter(x=xs, y=ys, color=cs, alpha=0.4, s=size)

    if label and canvas_type == "bokeh":
        canvas.text(xs, ys, list(map(str, list(pos.index.values))))
def gaussian_fit(self, center=None):
    r"""Fit a normalized Gaussian to this cloud (using SVD).

    Parameters
    ----------
    center
        If center is None (default), we find the best Gaussian with
        free mean.
        If center is given as an integer, use the point with that
        integer as the mean of the Gaussian.
        If center is given as a tuple or array, use that coordinate
        point as the mean of the Gaussian.

    Returns
    -------
    (mean, sigma, rotation) for the Gaussian, suitable for `gaussian`.
    """
    # All the numerics happen in the compiled helper.
    raw_points = self.coords.values
    return fast_algorithms.gaussian_fit(raw_points, center)
def cache_usage(self):
    r"""Report what fraction of the pairwise distances has been
    computed and stored in the distance cache so far.
    """
    num = self.coords.values.shape[0]
    total_pairs = 0.5 * num * (num - 1)
    if self.cache_type is None:
        return 0.0
    elif self.cache_type == "np":
        # Entries >= 0 have been computed; -1.0 marks "not yet".
        filled = np.count_nonzero(self.dist_cache >= 0)
        # The diagonal (self-distances) is pre-filled and doesn't count.
        return (filled - num) / total_pairs
    elif self.cache_type == "dict":
        return (len(self.dist_cache) - num) / total_pairs
def nearest_neighbors_slow(self, k):
    r"""Compute k nearest-neighbors of the PointCloud, by brute-force.
    Answers are cached in `self._nn[k]`.

    Parameters
    ----------
    k: int
        How many nearest neighbors to compute

    Returns
    -------
    np array with dtype int and shape==(N,k+1).  Entry [i,j] is the
    jth nearest neighbor of vertex i.  Note that entry [i,0] == i,
    so [i,k] is the kth nearest neighbor.

    Notes
    -----
    This method is intended for testing, and should only be used on
    small datasets.  It computes the full N-by-N distance matrix and
    is roughly quadratically slower than :func:`nearest_neighbors`.

    Examples
    --------

    >>> pc = PointCloud(np.array([[ 0.58814682,  0.45405299],
    ...                           [ 0.09197879,  0.39721367],
    ...                           [ 0.29128654,  0.28372039],
    ...                           [ 0.14593167,  0.7027367 ],
    ...                           [ 0.77068438,  0.37849037],
    ...                           [ 0.17281855,  0.70204687],
    ...                           [ 0.48146217,  0.54619034],
    ...                           [ 0.27831744,  0.67327757],
    ...                           [ 0.49074255,  0.70847318],
    ...                           [ 0.132656,    0.0860524 ]]))
    >>> pc.nearest_neighbors_slow(3)
    array([[0, 6, 4, 8],
           [1, 2, 3, 9],
           [2, 1, 9, 6],
           [3, 5, 7, 1],
           [4, 0, 6, 8],
           [5, 3, 7, 1],
           [6, 0, 8, 7],
           [7, 5, 3, 8],
           [8, 6, 7, 0],
           [9, 2, 1, 6]])

    See Also
    --------
    :func:`multidim.PointCloud.nearest_neighbors`
    """
    # simple cache
    cached = self._nn.get(k)
    if cached is not None:
        return cached
    everything = self.coords.index.values
    pairwise = self.dists(everything, everything)
    # Row-wise argsort: column 0 is always the point itself (distance 0).
    self._nn[k] = pairwise.argsort(axis=1)[:, :k + 1]
    return self._nn[k]
def nearest_neighbors(self, k):
    r"""Compute k nearest-neighbors of the PointCloud, using a clever
    CoverTree algorithm.  Answers are cached in `self._nn[k]`.

    Parameters
    ----------
    k: int
        How many nearest neighbors to compute

    Returns
    -------
    np array with dtype int and shape==(N,k+1).  Entry [i,j] is the
    jth nearest neighbor of vertex i.  Note that entry [i,0] == i,
    so [i,k] is the kth nearest neighbor.

    Examples
    --------

    >>> pc = PointCloud(np.array([[ 0.58814682,  0.45405299],
    ...                           [ 0.09197879,  0.39721367],
    ...                           [ 0.29128654,  0.28372039],
    ...                           [ 0.14593167,  0.7027367 ],
    ...                           [ 0.77068438,  0.37849037],
    ...                           [ 0.17281855,  0.70204687],
    ...                           [ 0.48146217,  0.54619034],
    ...                           [ 0.27831744,  0.67327757],
    ...                           [ 0.49074255,  0.70847318],
    ...                           [ 0.132656,    0.0860524 ]]))
    >>> pc.nearest_neighbors(3)
    array([[0, 6, 4, 8],
           [1, 2, 3, 9],
           [2, 1, 9, 6],
           [3, 5, 7, 1],
           [4, 0, 6, 8],
           [5, 3, 7, 1],
           [6, 0, 8, 7],
           [7, 5, 3, 8],
           [8, 6, 7, 0],
           [9, 2, 1, 6]])

    See Also
    --------
    :func:`multidim.PointCloud.nearest_neighbors_slow`
    """
    from . import covertree
    if self.covertree is None:
        self.covertree = covertree.CoverTree(self)

    # simple cache
    if k in self._nn:
        return self._nn[k]

    num_points = self.coords.shape[0]
    # -1 means "not found yet"
    self._nn[k] = -np.ones(shape=(num_points, k+1), dtype=np.int64)

    # Make sure we have the entire covertree.
    levels = [ level for level in self.covertree ]

    # run backwards: finest level (smallest radius) first, so the nearest
    # candidates are considered before coarser, more distant ones.
    for level in reversed(levels):
        r = level.radius  # NOTE(review): r is unused below -- leftover?
        for ci in level.adults:
            for x in level.children[ci]:
                # Slots still set to -1 have no neighbor assigned yet.
                unknown_neighbors = np.where(self._nn[k][x] < 0)[0]
                if len(unknown_neighbors) > 0:
                    to_find = unknown_neighbors[0]
                    # Candidate neighbors: children of all level-1 friends
                    # of this adult at this level.
                    candidates = []
                    for cj in level.friends1[ci]:
                        candidates.extend(level.children[cj])
                    candidates = np.array(candidates)
                    num_found = min(k+1, len(candidates))
                    # don't bother computing lengths if there is nothing to
                    # learn
                    # NOTE(review): this compares a count (num_found) to an
                    # index (to_find); presumably intentional as a "anything
                    # new to fill?" test -- confirm against covertree docs.
                    if num_found >= to_find:
                        dists = fast_algorithms.distance_cache_None(
                            np.array([x]), candidates,
                            self.coords.values).flatten()
                        order = dists.argsort()
                        # Fill only the still-unknown slots, in distance order.
                        self._nn[k][x, to_find:num_found] = candidates[order][to_find:num_found]
    return self._nn[k]
    # Commented-out brute-force alternative (kept from original source):
    #
    #        all_points = self.coords.index.values
    #        dists = self.dists(all_points, all_points)
    #        self._nn[k] = dists.argsort(axis=1)[:, :k+1]  # 0th entry is always self.
    #        return self._nn[k]
def witnessed_barycenters(self, k):
    r"""Build the PointCloud of k-witnessed barycenters, weighted by
    distance-to-measure.  This calls :func:`nearest_neighbors` with
    argument :code:`(k-1)`, which can be slow.

    Parameters
    ----------
    k : int
        How many vertices for each witnessed barycenter.  That is, use
        the (k-1) nearest neighbors, along with the vertex itself.

    Returns
    -------
    pc : :class:`PointCloud`
        A pointcloud whose 0-cells are the witnessed barycenters, and
        whose 1-cells are the edges between those barycenters, all
        weighted by the notion of distance to a measure.
    """
    n, d = self.coords.values.shape
    # First, look at the indices to uniqify: two points with the same
    # neighbor set (as a sorted tuple) yield the same polygon.
    polygons_indices = [tuple(np.sort(verts))
                        for verts in self.nearest_neighbors(k-1)]
    polygons_indices = np.array(list(set(polygons_indices)))
    p = polygons_indices.shape[0]
    assert p <= n
    # build the polygons from coordinates
    polygons = []
    for points in polygons_indices:
        polygons.append(self.coords.values[points, :])
    polygons = np.array(polygons)
    # Find the vertex barycenters
    assert polygons.shape == (p, k, d)
    polygons = polygons.swapaxes(0, 1)
    assert polygons.shape == (k, p, d)
    barycenters = polygons.mean(axis=0)
    assert barycenters.shape == (p, d)
    # compute weights: negative mean squared distance of the k vertices
    # to their barycenter.
    diffs = polygons - barycenters
    assert diffs.shape == (k, p, d)
    norms = np.linalg.norm(diffs, axis=2)**2
    assert norms.shape == (k, p)
    weights = -norms.sum(axis=0)/k
    assert weights.shape == (p,)
    pcbc = PointCloud(barycenters,
                      heights=np.sqrt(-weights),
                      dist=self.dist)
    # NOTE(review): this shadows the PointCloud.dists *method* on pcbc
    # with a plain distance matrix; the code below indexes it as an
    # array, so presumably intentional -- but confirm no later caller
    # expects pcbc.dists to be callable.
    pcbc.dists = squareform(pdist(pcbc.coords.values, pcbc.dist))
    # make edges between every pair of barycenters, with a height
    # derived from the pairwise distance and both weights.
    hgt = []
    pos = []
    idx = []
    bdy0 = []
    bdy1 = []
    for e, (i, j) in enumerate(itertools.combinations(pcbc.stratum[0].index.values, 2)):
        idx.append(e)
        bdy0.append(i)
        bdy1.append(j)
        pos.append(True)
        mu = pcbc.dists[i, j]
        wi = weights[i]
        wj = weights[j]
        r = np.sqrt(mu**2*(mu**2 - 2*wi - 2*wj) + (wi - wj)**2)/2/mu
        hgt.append(r)
    edges = pd.DataFrame({
        'height': hgt,
        'pos': pos,
        'rep': idx,
        'bdy0': bdy0,
        'bdy1': bdy1,
    }, columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
        index=idx)
    pcbc.stratum[1] = edges
    return pcbc
def unique_with_multiplicity(self):
    r"""Look for duplicate points, and return the unique points along
    with their multiplicities.

    ToDo: Use Covertree.

    Examples
    --------

    >>> a = np.array([[5.0, 2.0], [3.0, 4.0], [5.0, 2.0]])
    >>> pc = PointCloud(a)
    >>> b, counts = pc.unique_with_multiplicity()
    >>> print(b)
    [[ 3.  4.]
     [ 5.  2.]]
    >>> print(counts)
    [1 2]
    """
    coords = self.coords.values
    assert coords.shape[1] == 2,\
        "This uniqifying method can use only 2-dim data"
    s = coords.shape
    assert coords.dtype == 'float64'
    # Reinterpret each (x, y) float64 pair as a single complex128 value
    # so np.unique can treat a whole row as one element.  The dtype is
    # flipped in place and restored below -- NOTE(review): this mutates
    # self.coords.values temporarily, so it is not thread-safe.
    coords.dtype = 'complex128'
    tmp_coords, tmp_index, tmp_inverse, tmp_counts = np.unique(
        coords.flatten(),
        return_index=True,
        return_inverse=True,
        return_counts=True)
    # Undo the reinterpretation on both the result and the original.
    tmp_coords.dtype = 'float64'
    coords.dtype = 'float64'
    assert coords.shape == s
    n = tmp_coords.shape[0]
    d = coords.shape[1]
    assert n % d == 0
    # Each unique complex value expands back to one d-dimensional row.
    tmp_coords.shape = (n//d, d)
    return tmp_coords, tmp_counts
def dists(self, indices0, indices1):
    r"""Compute distances between the points indexed by indices0 and
    indices1.

    indices0 and indices1 must be 1-dimensional np arrays, so use
    not "5" but "np.array([5])"

    The return is a np array with
    shape == (len(indices0), len(indices1))

    This uses the distance cache, depending on self.cache_type.
    You can query the size of the cache with :func:`cache_usage`.
    """
    # Allow boolean selectors
    # NOTE(review): numpy boolean masks have dtype np.bool_, not
    # np.uint8, so this branch presumably targets uint8 masks produced
    # by the Cython layer -- confirm callers never pass dtype=bool.
    if indices0.dtype == np.uint8:
        indices0 = np.where(indices0)[0]
    if indices1.dtype == np.uint8:
        indices1 = np.where(indices1)[0]
    if self.cache_type is None:
        return fast_algorithms.distance_cache_None(indices0,
                                                   indices1,
                                                   self.coords.values)
    elif self.cache_type == "np":
        N = self.coords.values.shape[0]
        # (Re)build the cache if missing or mis-sized;
        # -1.0 marks "not yet computed", the diagonal is 0.
        if self.dist_cache is None or self.dist_cache.shape != (N, N):
            self.dist_cache = np.eye(N, dtype=np.float64) - 1.0
        return fast_algorithms.distance_cache_numpy(indices0,
                                                    indices1,
                                                    self.coords.values,
                                                    self.dist_cache)
    elif self.cache_type == "dict":
        N = self.coords.values.shape[0]
        # Seed the dict cache with zero self-distances.
        if self.dist_cache is None:
            self.dist_cache = dict(((i, i), np.float64(0.0)) for i in range(N))
        return fast_algorithms.distance_cache_dict(indices0,
                                                   indices1,
                                                   self.coords.values,
                                                   self.dist_cache)
    else:
        raise ValueError("cache_type can be None or 'dict' or 'np'")
def cover_ball(self, point_index=None):
    r"""Find a ball that covers the entire PointCloud.

    Parameters
    ----------
    point_index : int
        If point_index is given, we use that point as the center.
        If point_index is None (default), then we compute the point
        nearest the center-of-mass, which requires an extra
        :func:`numpy.mean` and :func:`scipy.spatial.distance.cdist`
        call.

    Returns
    -------
    ball : dict
        Keys 'index' (index of designated center point), 'point'
        (coordinates of designated center point), and 'radius'
        (radius of ball).
    """
    if point_index is None:
        # Choose the data point closest to the center of mass.
        barycenter = self.coords.values.mean(axis=0)
        barycenter.shape = (1, barycenter.shape[0])  # re-shape for cdist
        barycenter_dists = cdist(self.coords.values, barycenter,
                                 metric=self.dist)
        point_index = barycenter_dists.argmin()
    center_idx = np.array([point_index])
    everything = self.coords.index.values
    radius = self.dists(center_idx, everything).flatten().max()
    return {'index': point_index,
            'point': self.coords.values[point_index, :],
            'radius': radius}
@classmethod
def from_multisample_multilabel(cls, list_of_samples, list_of_labels,
                                equal_priors=True, normalize_domain=False):
    r"""Produce a single labeled and weighted pointcloud from a list of
    samples and labels of those samples.

    Parameters
    ----------
    list_of_samples :
        A list (or np array) of np arrays.  Each such array is
        considered to be a sample of N points in R^d.  N can vary
        between entries, but d cannot.
    list_of_labels :
        A list of labels.  Labels can be anything, but it is convenient
        to use strings like "red" and "blue".
        list_of_labels[i] is the label for the points in
        list_of_samples[i].
    equal_priors :
        Re-normalize weights so that each label is equally likely.
        Default: True
    normalize_domain :
        Use SVD/PCA to re-shape the original data to be roughly
        spherical.  This should allow better learning via CDER.
        Default: False
    """
    assert len(list_of_labels) == len(list_of_samples),\
        "list_of_labels must equal list_of arrays. {} != {}".format(
            len(list_of_labels), len(list_of_samples))
    ambient_dim = list_of_samples[0].shape[1]
    assert all([X.shape[1] == ambient_dim for X in list_of_samples]),\
        "Dimension mismatch among list_of_samples!"

    # One row per distinct label, sorted for a deterministic order,
    # e.g. ['blue', 'green', 'red'].
    label_info = pd.DataFrame(index=sorted(list(set(list(list_of_labels)))))
    # label_names = list(label_dict.keys())
    # label_index = np.array
    # num_labels = len(label_dict)

    # count how many times each label occurs.
    label_info['clouds'] = np.zeros(shape = (len(label_info),), dtype=np.int64)
    label_info['points'] = np.zeros(shape = (len(label_info),), dtype=np.int64)
    label_info['weight'] = np.zeros(shape = (len(label_info),), dtype=np.float64)
    for label in label_info.index:
        label_bool = np.array([l == label for l in list_of_labels])
        label_info.loc[label, 'clouds'] = np.count_nonzero(label_bool)
    label_info['int_index'] = label_info.index.get_indexer(label_info.index)

    # merge the samples into one big dataset
    # expand the sample-wise labels to point-wise labels
    points = np.concatenate(list(list_of_samples))
    if normalize_domain:
        # Whiten: center, rotate to principal axes, scale by sigma.
        m,s,v = fast_algorithms.gaussian_fit(points)
        points = np.dot((points - m), v.T)/s

    pointwise_labels = []  # keep track of labels, pointwise
    pointwise_source = []
    pointwise_weight = []
    for i, X in enumerate(list_of_samples):
        l = list_of_labels[i]
        num = X.shape[0]
        assert num > 0, "bad? {}".format(X.shape)
        pointwise_labels.extend([label_info['int_index'].loc[l]]*num)
        pointwise_source.extend([i]*num)
        if equal_priors:
            # Each sample contributes total mass 1/clouds(label), so
            # every label ends up with the same total weight.
            wt = 1.0/num/label_info.loc[l, 'clouds']
        else:
            wt = 1.0
        pointwise_weight.extend([wt]*num)
        label_info.loc[l, 'points'] = np.int64(label_info.loc[l, 'points']) + np.int64(num)
        label_info.loc[l, 'weight'] += wt*num
    pointwise_labels = np.array(pointwise_labels)
    pointwise_source = np.array(pointwise_source)
    pointwise_weight = np.array(pointwise_weight)

    # Build the merged cloud; weights become point masses.
    pc = cls(points, masses=pointwise_weight)
    pc.label_info = label_info
    pc.labels = pointwise_labels
    pc.source = pointwise_source
    return pc