# coding=utf-8
r"""
The `multidim` class provides user-facing tools for topological data analysis
of multi-dimensional data.
The goal is to be honest about topology while also using speed/cleverness with
a minimal amount of user headache.
Included are:
- `PointCloud`, for data points in Euclidean space.
- `SimplicialComplex`, for abstract simplicial complexes, built
from `Simplex` objects sorted by dimension into `SimplexStratum` objects.
Copyright
---------
- This file is part of https://github.com/geomdata/gda-public/
- 2015, 2016, 2017 by Geometric Data Analytics, Inc. (http://geomdata.com)
- AGPL license. See `LICENSE` or https://github.com/geomdata/gda-public/blob/master/LICENSE
Examples
--------
>>> X = np.load("tests/circle.npy")
>>> pc = PointCloud(X, max_length=-1)
>>> pc
A SimplicialComplex with 1000 points, 499500 edges, and 0 faces.
>>> np.all(pc.stratum[0]['pos'].values == True)
True
>>> pc.check()
>>> pc.make_pers0(cutoff=0.15)
>>> for v in pc.cells(0):
... if v.positive:
... print(v)
0+ Simplex 0 of height 0.0 and mass 1.0
0+ Simplex 74 of height 0.0 and mass 1.0
0+ Simplex 183 of height 0.0 and mass 1.0
0+ Simplex 195 of height 0.0 and mass 1.0
0+ Simplex 197 of height 0.0 and mass 1.0
0+ Simplex 231 of height 0.0 and mass 1.0
0+ Simplex 354 of height 0.0 and mass 1.0
0+ Simplex 397 of height 0.0 and mass 1.0
0+ Simplex 489 of height 0.0 and mass 1.0
0+ Simplex 530 of height 0.0 and mass 1.0
0+ Simplex 607 of height 0.0 and mass 1.0
0+ Simplex 757 of height 0.0 and mass 1.0
0+ Simplex 781 of height 0.0 and mass 1.0
0+ Simplex 800 of height 0.0 and mass 1.0
0+ Simplex 903 of height 0.0 and mass 1.0
0+ Simplex 980 of height 0.0 and mass 1.0
>>> pc.pers0.grab(5)['keepcode']
birth_index death_index birth death pers
979 213 316 0.0 0.136923 0.136923
980 135 135 0.0 0.136992 0.136992
981 439 477 0.0 0.138059 0.138059
982 610 630 0.0 0.138474 0.138474
983 603 603 0.0 0.139332 0.139332
>>> pc.make_pers1_rca1(cutoff=0.2)
>>> pc.pers1.grab(5)['keepcode']
birth_index death_index birth death pers
221 3217 9700 0.095619 0.168120 0.072501
220 2942 9661 0.091542 0.167720 0.076177
219 2713 9279 0.087438 0.164152 0.076713
224 3333 10439 0.097564 0.174643 0.077079
200 1816 7688 0.071490 0.149336 0.077846
>>> V=pc.stratum[0]
>>> V.loc[:10]
height mass pos rep
0 0.0 1.0 True 0
1 0.0 1.0 False 0
2 0.0 1.0 False 1
3 0.0 1.0 False 0
4 0.0 1.0 False 0
5 0.0 1.0 False 4
6 0.0 1.0 False 1
7 0.0 1.0 False 0
8 0.0 1.0 False 0
9 0.0 1.0 False 0
10 0.0 1.0 False 1
>>> pc.cells(0)[0]
0+ Simplex 0 of height 0.0 and mass 1.0
>>> pc.cells(0)[2]
0- Simplex 2 of height 0.0 and mass 1.0
>>> E=pc.stratum[1]
>>> E.loc[:10]
height pos rep bdy0 bdy1
0 0.001142 False 0 858 866
1 0.001997 False 1 98 187
2 0.002471 False 2 251 313
3 0.002670 False 3 599 629
4 0.002766 False 4 150 167
5 0.003405 False 5 573 620
6 0.003812 False 6 474 517
7 0.005357 False 7 893 988
8 0.005533 False 8 623 644
9 0.005914 False 9 648 744
10 0.006056 False 10 612 640
>>> pc.cells(1)[2]
1- Simplex 2 of height 0.0024707293775457456 and mass None
"""
from __future__ import print_function
from collections import defaultdict
import itertools
import numpy as np
import pandas as pd
# Don't roll our own L2 norms
from scipy.spatial.distance import squareform, cdist, pdist, is_valid_dm
from . import fast_algorithms
import homology.dim0
import homology.dim1
[docs]class Simplex(object):
r"""
This class is a convenient container to access the data in the
pd DataFrame stratum[dim] of a SimplicialComplex.
It is always faster to access stratum[dim].loc[index] directly.
Parameters
----------
cellcomplex : :class:`SimplicialComplex`
The SimplicialComplex to which this Simplex belongs.
dim : int
The dimension in which this Simplex lives.
index : int
The abstract index of this Simplex.
Attributes
----------
cellcomplex : `SimplicialComplex`
The `SimplicialComplex` to which this Simplex belongs.
dim : int
The dimension in which this Simplex lives.
index : int
The abstract index of this Simplex.
shadow_complex : `SimplicialComplex`
children : :class:`pandas.Series`
See Also
--------
SimplicialComplex : A class for abstract simpicial cell complexes.
Notes
-----
A finite (*abstract*) *simplicial complex* is a finite set :math:`A` together
with collection :math:`\Delta` of subsets of :math:`A` such that if
:math:`X \in \Delta` and :math:`Y \subset X` then
:math:`Y \in \Delta`. [1]_
References
----------
.. [1] D. Feichtner-Kozlov, Combinatorial algebraic topology.
Berlin: Springer, 2008.
"""
def __init__(self, cellcomplex, dim, index):
self.cellcomplex = cellcomplex
self.dim = dim
self.index = index
self.shadow_complex = None
self.children = pd.Series(dtype=np.float64)
@property
def height(self):
r"""
:return: height (that is, filtered value) of this cell (np.float64)
"""
return self.cellcomplex.stratum[self.dim]['height'].loc[self.index]
@height.setter
def height(self, v):
self.cellcomplex.stratum[self.dim]['height'].loc[self.index] = v
@property
def mass(self):
r"""
:return: mass of this cell (np.float64 or None)
"""
if 'mass' in self.cellcomplex.stratum[self.dim]:
return self.cellcomplex.stratum[self.dim]['mass'].loc[self.index]
else:
return None
@mass.setter
def mass(self, v):
self.cellcomplex.stratum[self.dim]['mass'].loc[self.index] = v
@property
def positive(self):
r"""
:return:
"""
return self.cellcomplex.stratum[self.dim]['pos'].loc[self.index]
@positive.setter
def positive(self, b):
r"""
:param b:
"""
self.cellcomplex.stratum[self.dim]['pos'].loc[self.index] = b
@property
def representative(self):
r"""
:return:
"""
return self.cellcomplex.stratum[self.dim]['rep'].loc[self.index]
@representative.setter
def representative(self, r):
r"""
:param r:
"""
self.cellcomplex.stratum[self.dim]['rep'].loc[self.index] = r
@property
def boundary(self):
r"""
:return:
"""
parts = []
if self.dim > 0:
parts = range(self.dim + 1)
return {self.cellcomplex.stratum[self.dim]['bdy{}'.format(j)].loc[self.index] for j in parts}
@boundary.setter
def boundary(self, s):
r"""
:param s:
"""
for i, c in enumerate(sorted(list(s))):
self.cellcomplex.stratum[self.dim]['bdy{}'.format(i)].loc[
self.index] = c
def __hash__(self):
r"""
:param self:
:return:
"""
return self.index
def __eq__(self, other):
r"""
:param other:
:return:
"""
return self.cellcomplex == other.cellcomplex and self.__hash__() == other.__hash__()
def __repr__(self):
r"""
:return:
"""
sign = "+"
if not self.positive:
sign = "-"
return "{}{} Simplex {} of height {} and mass {}".format(self.dim, sign, self.index,
repr(self.height),
repr(self.mass))
def __lt__(self, other):
if not (self.cellcomplex == other.cellcomplex):
raise ValueError("These Cells are not in the same SimplicialComplex!")
if not (self.dim == other.dim):
raise ValueError("These Cells are not of the same dimension!")
return self.height() < other.height()
[docs]class SimplexStratum(object):
r""" SimplexStratum is a thin class for calling :class:`Simplex` objects of a certain
dimension from a `SimplicialComplex`. It is an interface to the data in
`SimplicialComplex.stratum`[dim], which is a `pandas.DataFrame`. Whenever
possible, the `pandas.DataFrame` should be called directly, for speed.
"""
def __init__(self, cell_complex, dim):
self.cell_complex = cell_complex
self.dim = dim
self._cells = dict()
def __getitem__(self, i):
if i not in self._cells:
self._cells[i] = Simplex(self.cell_complex, self.dim, i)
return self._cells[i]
def __iter__(self):
for i in self.cell_complex.stratum[self.dim].index:
yield self[i]
def __repr__(self):
return "Stratum {} of SimplicialComplex {}".format(self.dim,
id(self.cell_complex))
def __hash__(self):
return id(self.cell_complex), self.dim
def __eq__(self, other):
return type(self) == other(self) and self.__hash__ == other.__hash__
def stratum_maker(dim=0):
r"""
Make an empty stratum :class:`pandas.DataFrame` of the appropriate dimension.
This is used to initialize a new dimension of a :class:`SimplicialComplex`.
Parameters
----------
dim : int
Dimension of stratum (0, 1, 2, ...)
Returns
-------
DataFrame : :class:`pandas.DataFrame`
pd DataFrame suitable for SimplicialComplex.stratum[dim]
See Also
--------
:class:`SimplicialComplex` : A class for abstract simplicial cell complexes.
"""
bdy_size = 0
if dim > 0:
bdy_size = dim + 1
return pd.DataFrame({},
columns=['height', 'pos', 'rep'] + ['bdy{}'.format(i)
for i in
range(bdy_size)],
index=range(0))
def stratum_from_distances(dists, max_length=-1.0, points=None):
r""" Construct a stratum dictionary from a symmetric matrix of distances.
Parameters
----------
dists : :class:`numpy.ndarray`
A symmetric NxN array for distances, as obtained from
:class:`scipy.spatial.distances.squareform`
max_length : int
If max_length >=0, store only those edges
of length < max_length. Default: -1.0, store all edges.
points : :class:`pandas.DataFrame`
A fully-formed DataFrame of point information for stratum[0].
Returns
-------
{0: points, 1: edges} : dict
A stratum dictionary, suitable for SimplicialComplex objects.
See Also
--------
:func:`fast_algorithms.edges_from_dists`
"""
is_valid_dm(dists, throw=True)
if points is None:
n = dists.shape[0]
idx0 = np.arange(n, dtype=np.int64)
hgt0 = np.zeros(n, dtype=np.float64)
pos0 = np.ones(shape=(n,), dtype='bool')
points = pd.DataFrame({
'height': hgt0,
'pos': pos0,
'rep': idx0,
},
columns=['height', 'pos', 'rep'],
index=idx0)
if max_length == 0:
# if the cutoff is 0, we don't want to bother to make all the
# distances.
edges = stratum_maker(1)
else:
hgt1, pos1, bdys = fast_algorithms.edges_from_dists(points.index.values, dists,
cutoff=np.float64(max_length))
num_edges = hgt1.shape[0]
idx1 = np.arange(num_edges, dtype='int64')
edges = pd.DataFrame({
'height': hgt1,
'pos': pos1,
'rep': idx1,
'bdy0': bdys[:, 0],
'bdy1': bdys[:, 1],
},
columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
index=idx1)
return {0: points, 1: edges}
def lower_star_for_image(img_array):
"""
Compute the lower star weighted simplicial complex from a 2d grid/image.
Parameters
----------
img_array, a `numpy.ndarray` of dimension 2.
Returns
-------
`homology.SimplicialComplex`
Examples
--------
>>> A = np.random.rand(3,4)
>>> lower_star_for_image(A)
A SimplicialComplex with 12 points, 23 edges, and 12 faces.
"""
assert len(img_array.shape) == 2,\
"Lower-star filtration is currently for images (2d arrays) only."
m = img_array.shape[0]
n = img_array.shape[1]
# make all vertices, by flattening val_array and indexing in the normal way
verts_hgt = img_array.flatten()
verts_rep = np.arange(m*n)
flat_index = verts_rep.reshape(m, n)
edges_hgt = []
edges_rep = []
edges_bdy0 = []
edges_bdy1 = []
# Make all the horizontal edges.
for i, j in itertools.product(range(m), range(n-1)):
# collect vertices' indices and heights
# left=(i,j) -- right=(i,j+1)
lf_idx = flat_index[i, j]
rt_idx = flat_index[i, j+1]
# There is no real reason for these asserts -- just clarification.
# assert lf_idx == n*(i) + (j)
# assert rt_idx == n*(i) + (j+1)
lf_hgt = img_array[i, j]
rt_hgt = img_array[i, j+1]
# There is no real reason for these asserts -- just clarification.
# assert lf_hgt == verts_hgt[lf_idx]
# assert rt_hgt == verts_hgt[rt_idx]
edges_hgt.append(np.max([lf_hgt, rt_hgt]))
edges_rep.append(len(edges_rep))
edges_bdy0.append(lf_idx)
edges_bdy1.append(rt_idx)
# This i,j horizontal edge should have index (n-1)*i + j
assert len(edges_hgt) - 1 == (n-1)*i + j
# did we count all horizontal edges?
assert len(edges_hgt) == (n-1)*m
# Make all the vertical edges
for i, j in itertools.product(range(m-1), range(n)):
# collect vertices' indices and heights
# top=(i,j)
# |
# bot=(i+1,j)
tp_idx = flat_index[i, j]
bt_idx = flat_index[i+1, j]
# There is no real reason for these asserts -- just clarification.
# assert tp_idx == n*(i) + (j)
# assert bt_idx == n*(i+1) + (j)
tp_hgt = img_array[i, j]
bt_hgt = img_array[i+1, j]
# There is no real reason for these asserts -- just clarification.
# assert tp_hgt == verts_hgt[tp_idx]
# assert bt_hgt == verts_hgt[bt_idx]
edges_hgt.append(np.max([tp_hgt, bt_hgt]))
edges_rep.append(len(edges_rep))
edges_bdy0.append(tp_idx)
edges_bdy1.append(bt_idx)
# This i,j vertical edge should have index n*i + j
# AFTER the (n-1)*m horizontal edges
assert len(edges_hgt) - 1 == (n-1)*m + n*i + j
# did we cound all vertical AND horizontal edges?
assert len(edges_hgt) == (n-1)*m + n*(m-1)
faces_hgt = []
faces_rep = []
faces_bdy0 = []
faces_bdy1 = []
faces_bdy2 = []
# Make the diagonal edges, and the faces, too.
for i, j in itertools.product(range(m-1), range(n-1)):
# collect the vertices' indices and heights
# nw=(i,j) ne=(i, j+1)
# at (i,j)
# sw=(i+1, j) se=(i+1, j+1)
nw_idx = flat_index[i, j]
ne_idx = flat_index[i, j+1]
se_idx = flat_index[i+1, j+1]
sw_idx = flat_index[i+1, j]
# There is no real reason for these asserts -- just clarification.
# assert nw_idx == n*(i) + (j)
# assert ne_idx == n*(i) + (j+1)
# assert se_idx == n*(i+1) + (j+1)
# assert sw_idx == n*(i+1) + (j)
nw_hgt = img_array[i, j]
ne_hgt = img_array[i, j+1]
se_hgt = img_array[i+1, j+1]
sw_hgt = img_array[i+1, j]
# There is no real reason for these asserts -- just clarification.
# assert nw_hgt == verts_hgt[nw_idx]
# assert ne_hgt == verts_hgt[ne_idx]
# assert se_hgt == verts_hgt[se_idx]
# assert sw_hgt == verts_hgt[sw_idx]
# determine diagonal
cell_max_loc = np.argmax([nw_hgt, ne_hgt, se_hgt, sw_hgt])
if cell_max_loc % 2 == 0:
# Max is either nw or se.
# Make edge (nw,se)
edges_hgt.append(np.max([nw_hgt, se_hgt]))
edges_rep.append(len(edges_rep))
edges_bdy0.append(nw_idx)
edges_bdy1.append(se_idx)
# Make face (nw,ne,se).
faces_hgt.append(np.max([nw_hgt, ne_hgt, se_hgt]))
faces_rep.append(len(faces_rep))
faces_bdy0.append((n-1)*i + j) # horizontal nw-ne
# assert edges_bdy0[ (n-1)*i + j ] == nw_idx
# assert edges_bdy1[ (n-1)*i + j ] == ne_idx
faces_bdy1.append((n-1)*m + n*i + j+1) # vertical ne|se
# assert edges_bdy0[ (n-1)*m + n*i + j+1 ] == ne_idx
# assert edges_bdy1[ (n-1)*m + n*i + j+1 ] == se_idx
faces_bdy2.append(edges_rep[-1]) # most recent edge is nw\se
# assert edges_bdy0[ edges_rep[-1] ] == nw_idx
# assert edges_bdy1[ edges_rep[-1] ] == se_idx
# Make face (sw,se,nw).
faces_hgt.append(np.max([sw_hgt, se_hgt, nw_hgt]))
faces_rep.append(len(faces_rep))
faces_bdy0.append((n-1)*(i+1) + j) # horizontal sw-se
# assert edges_bdy0[ (n-1)*(i+1) + j ] == sw_idx
# assert edges_bdy1[ (n-1)*(i+1) + j ] == se_idx
faces_bdy1.append((n-1)*m + n*i + j) # vertical nw|sw
# assert edges_bdy0[ (n-1)*m + n*i + j ] == nw_idx
# assert edges_bdy1[ (n-1)*m + n*i + j ] == sw_idx
faces_bdy2.append(edges_rep[-1]) # most recent edge is nw\se
# assert edges_bdy0[ edges_rep[-1] ] == nw_idx
# assert edges_bdy1[ edges_rep[-1] ] == se_idx
else:
# Max is either ne or sw.
# Make edge (ne,sw)
edges_hgt.append(np.max([ne_hgt, sw_hgt]))
edges_rep.append(len(edges_rep))
edges_bdy0.append(ne_idx)
edges_bdy1.append(sw_idx)
# Make face (nw,ne,sw).
faces_hgt.append(np.max([nw_hgt, ne_hgt, sw_hgt]))
faces_rep.append(len(faces_rep))
faces_bdy0.append((n-1)*i + j) # horizontal nw-ne
# assert edges_bdy0[ (n-1)*i + j ] == nw_idx
# assert edges_bdy1[ (n-1)*i + j ] == ne_idx
faces_bdy1.append((n-1)*m + n*i + j) # vertical nw|sw
# assert edges_bdy0[ (n-1)*m + n*i + j ] == nw_idx
# assert edges_bdy1[ (n-1)*m + n*i + j ] == sw_idx
faces_bdy2.append(edges_rep[-1]) # most recent edge is ne\sw
# assert edges_bdy0[ edges_rep[-1] ] == ne_idx
# assert edges_bdy1[ edges_rep[-1] ] == sw_idx
# Make face (sw,se,ne).
faces_hgt.append(np.max([sw_hgt, se_hgt, ne_hgt]))
faces_rep.append(len(faces_rep))
faces_bdy0.append((n-1)*(i+1) + j) # horizontal sw-se
# assert edges_bdy0[ (n-1)*(i+1) + j ] == sw_idx
# assert edges_bdy1[ (n-1)*(i+1) + j ] == se_idx
faces_bdy1.append((n-1)*m + n*i + j+1) # vertical ne|se
# assert edges_bdy0[ (n-1)*m + n*i + j+1 ] == ne_idx
# assert edges_bdy1[ (n-1)*m + n*i + j+1 ] == se_idx
faces_bdy2.append(edges_rep[-1]) # most recent edge is ne\sw
# assert edges_bdy0[ edges_rep[-1] ] == ne_idx
# assert edges_bdy1[ edges_rep[-1] ] == sw_idx
verts_pos = np.ones_like(verts_hgt, dtype='bool')
edges_pos = np.ones_like(edges_rep, dtype='bool')
faces_pos = np.ones_like(faces_rep, dtype='bool')
verts = pd.DataFrame({'height': verts_hgt,
'rep': verts_rep,
'pos': verts_pos},
columns=['height', 'pos', 'rep'])
edges = pd.DataFrame({'height': edges_hgt,
'rep': edges_rep,
'pos': edges_pos,
'bdy0': edges_bdy0,
'bdy1': edges_bdy1},
columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'])
faces = pd.DataFrame({'height': faces_hgt,
'rep': faces_rep,
'pos': faces_pos,
'bdy0': faces_bdy0,
'bdy1': faces_bdy1,
'bdy2': faces_bdy2},
columns=['height', 'pos', 'rep', 'bdy0', 'bdy1', 'bdy2'])
return SimplicialComplex(stratum={0: verts, 1: edges, 2: faces})
[docs]class SimplicialComplex(object):
r"""
A class for abstract *weighted* simplicial complexes.
A SimplicialComplex is built from 0-cells (vertices), 1-cells (edges),
2-cells (faces), and so on.
Each cell knows its boundary. A 0-cell has no boundary. A 1-cell has two
0-cells as its boundary. A 2-cell has three 1-cells as its boundary, and
so on.
Each cell *must* height value, called `height`. These heights are used in
several topological algorithms that depend on filtering.
Each cell *may* have a mass value, called `mass`. These masses are used in
some data-analysis methods that involve weighted averaging or probability.
Each cell can
A SimplicialComplex has no notion of coordinates or embedding. For that,
use the :class:`PointCloud` class, which inherits all methods from
SimplicialComplex but also adds coordinate-dependent methods.
Parameters
----------
stratum : dict
Dictionary of :class:`pandas.DataFrame` objects holding vertices,
edges, faces, and so on. See examples below.
Notes
-----
One can reference the :class:`Simplex` objects that are separated by
dimension into :class:`SimplexStratum` objects.
Each :class:`Simplex` object has a height, so these are *filtered*
simplicial complexes.
Each :class:`Simplex` object may have a mass, so these can be are *weighted*
simplicial complexes.
The **rep** and **pos** attributes are used when computing various
homologies, such as the Rips or Cech complexes.
Whenever possible, computations are done on :class:`numpy.ndarray` arrays
in compiled code, so they are usually quite fast.
Examples
--------
For example, a formal triangle could be built this way:
>>> vertices = pd.DataFrame({'height':[ 0.0, 0.0, 0.0],
... 'mass': [1.0, 1.0, 1.0],
... 'pos': [True, True, True],
... 'rep' : [0, 1, 2]})
>>> edges = pd.DataFrame({'height':[ 0.0, 0.0, 0.0],
... 'pos': [True, True, True],
... 'rep' : [0, 1, 2],
... 'bdy0': [0, 1, 2],
... 'bdy1': [1, 2, 0]})
>>> faces = pd.DataFrame({'height': [0.0],
... 'pos': [True],
... 'rep': [0],
... 'bdy0': [0],
... 'bdy1': [1],
... 'bdy2': [2]})
>>> T = SimplicialComplex(stratum = {0: vertices, 1: edges, 2: faces})
>>> print(T)
A SimplicialComplex with 3 points, 3 edges, and 1 faces.
"""
def __init__(self, stratum=None):
if stratum is not None:
assert all(type(k) == int and k >= 0 for k in stratum.keys()), \
"The strata of a SimplicialComplex must be indexed by non-negative integers."
self.stratum = defaultdict(stratum_maker, stratum)
else:
self.stratum = defaultdict(stratum_maker)
self._nn = dict()
self._cellstratum = dict()
self.pers0 = None
self.pers1 = None
[docs] @classmethod
def from_distances(cls, dists, max_length=-1.0, points=None):
r"""
Construct a `SimplicialComplex` from a symmetric matrix of distances.
Parameters
----------
dists : `numpy.ndarray`
An N-by-N symmetric array, with 0s on the diagonal,
as obtained from :func:`scipy.spatial.distances.squareform`
max_length : float
If :code:`max_length >= 0`, store only those edges of length less
than :code:`max_length`. Default: -1, store all edges.
points : `pandas.DataFrame`
A fully-formed DataFrame of point information for
stratum[0]. But, if you have that info, you probably want to use
`PointCloud` instead.
Returns
-------
`SimplicialComplex`
"""
stratum = stratum_from_distances(dists, max_length, points)
return cls(stratum=stratum)
[docs] def check(self):
r"""Run consistency checks on all simplices in all dimensions.
raises ValueError if anything is wrong.
"""
for dim in self.stratum.keys():
if dim > 0:
valcheck = fast_algorithms.check_heights(self, dim)
if not valcheck == -1:
raise ValueError("Not a filtration! Check 'height' in ({}).stratum[{}].iloc[{}]".format(self, dim, valcheck))
def __repr__(self):
return "A SimplicialComplex with {} points, {} edges, and {} faces.".format(
len(self.stratum[0]), len(self.stratum[1]),
len(self.stratum[2]))
[docs] def cells(self, dim):
r""" iterate over all :class:`Simplex` objects of dimension dim.
This is generated on-demand from the :class:`SimplexStratum`.
"""
if dim not in self._cellstratum:
self._cellstratum[dim] = SimplexStratum(self, dim)
return self._cellstratum[dim]
[docs] def reset(self):
""" delete persistence diagrams, and forget all representative and
positivity information. Use this before re-running :func:`make_pers0`
or other methods in `homology` with a smaller cutoff.
"""
self.pers0 = None
self.pers1 = None
for dim in self.stratum.keys():
if self.stratum[dim] is None:
self.stratum[dim] = stratum_maker(dim)
self.stratum[dim]['rep'].values[:] = self.stratum[
dim].index.values # identity representation
self.stratum[dim]['pos'].values[:] = True
pass
[docs] def make_pers0(self, cutoff=-1.0):
r"""Run the UnionFind algorithm to mark connected components of the
SimplicialComplex. This marks points as positive/negative.
It also marks the reprensetatives of points.
It makes a PersDiag object with unionfind, saved as :code:`self.pers0`
"""
if (1 not in self.stratum.keys()) or len(self.stratum[1]) == 0:
raise ValueError("This SimplicialComplex has no 1-stratum (edges). Persistence is meaningless.")
try:
if self.max_length >= 0.0 and cutoff > self.max_length:
raise ValueError("Persistence cutoff is greater than max_length of pre-computed edges. This is meaningless.")
except AttributeError:
pass
tbirth_index, tdeath_index, ybirth_index, ydeath_index, mergetree = homology.dim0.unionfind(self, np.float64(cutoff))
self.pers0 = homology.PersDiag(tbirth_index, tdeath_index, ybirth_index, ydeath_index, mergetree)
pass
def sever(self):
r"""
Subdivide a SimplicialComplex or PointCloud into several smaller
partitions, using the known 0-dimensional persistence diagram. This is
an iterator (it _yields_ the terms).
Two points end up in the same partition if and only if they are
connected by a sequence of edges of length < cutoff.
Yields
------
pairs (indices, subpointcloud) of persistently connected
SimplicialComplexes/PointClouds.
The first term gives indices of the these points from the original `PointCloud`
The second term gives a new `PointCloud` with its own sequential index.
Notes
-----
This uses the 0-dimensional Persistence Diagram; therefore, you should
run `self.reset()` and `self.make_pers0(cutoff)` first.
See Also
--------
:func:`make_pers0` :func:`reset`
Examples
--------
>>> pc = PointCloud(np.array([[0.,0.],[0.,0.5],[1.,0.],[5.,0.],[6.,0.],[5.,-0.6]]), max_length=-1.0)
>>> pc.make_pers0(cutoff=1.9)
>>> for indices,sub_pc in pc.sever():
... print(indices)
... print(sub_pc)
... print(sub_pc.coords)
[0 1 2]
A SimplicialComplex with 3 points, 0 edges, and 0 faces.
0 1
0 0.0 0.0
1 0.0 0.5
2 1.0 0.0
[3 4 5]
A SimplicialComplex with 3 points, 0 edges, and 0 faces.
0 1
0 5.0 0.0
1 6.0 0.0
2 5.0 -0.6
"""
from homology.dim0 import all_roots
roots = self.stratum[0]['rep'].values.copy()
all_roots(roots)
for i in np.where(self.stratum[0]['pos'].values == True)[0]:
yield np.where(roots == i)[0], PointCloud(self.coords.values[roots == i, :])
[docs] def make_pers1_rca1(self, cutoff=-1.0):
r""" Run RCA1 and make a 1-dimensional `homology.PersDiag` for the
edge-pairings for cycle generators.
This reruns self.make_pers0(cutoff) again, to make sure components are
marked correctly.
Parameters
-----------
cutoff: float
Maximum edge height to use for RCA1 algorithm. Higher edges ignored.
(Default: -1, meaning use all edges.)
Returns
-------
none. Produces `self.pers1`
Table of edge pairs, similar to a persistence diagram.
BUGS
----
data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,0.5]]) fails.
Examples
--------
>>> data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,1.]])
>>> pc = PointCloud(data, max_length=-1)
>>> print(pc.stratum[1])
height pos rep bdy0 bdy1
0 1.000000 True 0 0 1
1 1.000000 True 1 0 2
2 1.000000 True 2 1 3
3 1.000000 True 3 2 3
4 1.414214 True 4 0 3
5 1.414214 True 5 1 2
>>> pc.make_pers1_rca1()
>>> print(pc.pers1.diagram)
birth_index death_index birth death pers
0 3 4 1.0 1.414214 0.414214
>>> data = np.array([[0.,0.],[1.,0.],[0.,1.],[1.,0.5]])
>>> pc = PointCloud(data, max_length=-1)
>>> print(pc.stratum[1])
height pos rep bdy0 bdy1
0 0.500000 True 0 1 3
1 1.000000 True 1 0 1
2 1.000000 True 2 0 2
3 1.118034 True 3 0 3
4 1.118034 True 4 2 3
5 1.414214 True 5 1 2
>>> pc.make_pers1_rca1()
>>> print(pc.pers1.diagram)
Empty DataFrame
Columns: [birth_index, death_index, birth, death, pers]
Index: []
"""
# we need 0dim persistence first.
self.reset()
self.make_pers0(cutoff=cutoff)
column_list, column_edge_index, stop_edge = homology.dim1.rca1(self.stratum[1], cutoff=cutoff)
assert len(column_list) == len(column_edge_index)
pers_list = [(c[-1], column_edge_index[i]) for i, c in
enumerate(column_list) if c]
p = np.array(pers_list)
if len(p)>0:
mergetree = dict([]) # we can't compute a mergetree yet
self.pers1 = homology.PersDiag(
p[:, 0],
p[:, 1],
self.stratum[1]['height'].loc[p[:, 0]].values,
self.stratum[1]['height'].loc[p[:, 1]].values,
mergetree)
else:
# no births or deaths recorded
self.pers1 = homology.PersDiag([], [], [], [], dict([]))
pass
[docs]class PointCloud(SimplicialComplex):
r""" PointCloud is a class for *embedded*, weighted simplicial complexes.
This is a subclass of :class:`SimplicialComplex`, with the additional property
that every 0-cell (vertex) is actually a point in :math:`\mathbb{R}^k.`
The most basic and most common example of a PointCloud is an indexed set of
:math:`N` points in :math:`\mathbb{R}^k` with heights assigned as 0, and
mass assigned as 1.
Typically, a user starts with 0-cells only. Then, any 1-cells, 2-cells,
..., are created later via some topological construction.
"""
def __init__(self, data_array, max_length=0.0, heights=None, masses=None,
dist='euclidean', idx0=None, cache_type=None):
r""" Construct a :class:`PointCloud` from a cloud of n points in
:math:`\mathbb{R}^k.`
Parameters
----------
data_array : :class:`numpy.ndarray`
A np array with shape=(n,k), to use as the pointcloud. The array
must have dtype=='float64'.
max_length : float
If max_length is positive, then find edges of length <= max_length.
This uses the :class:`multidim.covertree.CoverTree` for efficiency.
Default is 0.0, meaning compute no edges.
heights : :class:`numpy.ndarray`
If heights is given, it is used to assign graded values to the
points in data_array. It must be a np array of dtype=='float64' and
shape==(n,), like from np.apply_along_axis(my_func, 1, data_array)
Default: None (all have height 0.0)
masses : :class:`numpy.ndarray`
If masses is given, it is used to assign mass values to the
points in data_array. It must be a np array of dtype=='float64' and
shape==(n,), like from np.apply_along_axis(my_func, 1, data_array)
Default: None (all have mass 1.0)
idx0 : :class:`numpy.ndarray`
If idx0 is given, it is used to assign index values to the
points in data_array. It must be a np array of dtype=='int64' and
shape==(n,),
Default: None (index by order given in data_array)
cache_type : None or "np" or "dict"
What type of distance cache to use. Often None is actually faster!
If you really care about speed, remember to use -O
dist : function
If dist is given, it is used as the distance function for computing
edge lengths, via scipy.spatial.distance.pdist. Not used with on-demand caching.
Default: 'euclidean'
"""
assert data_array.dtype == np.float64, "Data must be float64."
n, k = data_array.shape
self.dimension = k
self.cache_type = cache_type
self.dist_cache = None
self.dist = dist
if self.cache_type is None:
self.dist_cache = None
elif self.cache_type == "np":
self.dist_cache = np.eye(n, dtype=np.float64) - 1.0
elif self.cache_type == "dict":
self.dist_cache = dict(((i,i), np.float64(0.0)) for i in range(n))
else:
raise ValueError("cache_type can be None or 'dict' or 'np'")
if heights is None:
heights = np.zeros(n, dtype=np.float64)
else:
assert type(heights) == np.ndarray \
and heights.shape == (n,) \
and heights.dtype == 'float64', \
"Wrong type or size for heights data on pointcloud."
hgt0 = heights
if masses is None:
masses = np.ones(n, dtype=np.float64)
else:
assert type(masses) == np.ndarray \
and masses.shape == (n,) \
and masses.dtype == 'float64', \
"Wrong type or size for heights data on pointcloud."
mas0 = masses
pos0 = np.ones(shape=(n,), dtype='bool')
if idx0 is None:
idx0 = np.arange(n, dtype='int64')
else:
assert type(idx0) == np.ndarray \
and idx0.shape == (n,) \
and idx0.dtype == 'int64', \
"Wrong type or size for indexing data on pointcloud."
points = pd.DataFrame({
'height': hgt0,
'mass': mas0,
'pos': pos0,
'rep': idx0,
},
columns=['height', 'mass', 'pos', 'rep'],
index=idx0)
self.coords = pd.DataFrame(data_array, index=idx0)
self.covertree = None
edges = stratum_maker(1)
super(self.__class__, self).__init__(stratum={0: points, 1: edges})
self.labels = np.zeros(shape=(self.coords.shape[0],), dtype=np.int64)
self.source = np.zeros(shape=(self.coords.shape[0],), dtype=np.int64)
self.label_info = pd.DataFrame(index=['black'])
self.label_info['clouds'] = np.array([1], dtype=np.int64)
self.label_info['points'] = np.array([n], dtype=np.int64)
self.label_info['tot_mass'] = np.array([self.stratum[0]['mass'].sum()])
self.label_info['int_index'] = np.array([0], dtype=np.int64)
self.max_length = max_length
if self.max_length > 0.0 or self.max_length == -1.0:
# use covertree to make all appropriate edges.
from . import covertree
self.covertree = covertree.CoverTree(self)
bdy0 = []
bdy1 = []
hgts = []
for i, j, d in self.covertree.make_edges(max_distance=self.max_length):
bdy0.append(min(i,j))
bdy1.append(max(i,j))
hgts.append(d)
bdy0 = np.array(bdy0, dtype=np.int64)
bdy1 = np.array(bdy1, dtype=np.int64)
hgts = np.array(hgts)
sortby = hgts.argsort()
bdy0 = bdy0[sortby]
bdy1 = bdy1[sortby]
hgts = hgts[sortby]
edges = pd.DataFrame({'height': hgts,
'pos': np.ones(shape=hgts.shape, dtype='bool'),
'rep': np.arange(hgts.shape[0], dtype=np.int64),
'bdy0': bdy0, 'bdy1': bdy1, },
columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
index=np.arange(hgts.shape[0], dtype=np.int64))
self.stratum[1] = edges
[docs] @classmethod
def from_distances(cls, *args, **kwargs):
r"""
This method is not available for `PointCloud`, because actual
coordinates are needed. Perhaps you want to use
:func:`SimplicialComplex.from_distances` instead?
"""
raise NotImplementedError("This method does not inherit to PointCloud. Use the version from the parent class, SimplicialComplex.")
[docs] def plot(self, canvas, cutoff=-1, color='purple', pos_edges=False,
edge_alpha=-1.0, size=1,
twocells=False, title="SimplicialComplex", label=False):
r"""
Plot a PointCloud, decorated by various proeprties.
Often slow!
Parameters
----------
canvas : object
An instance of
`bokeh.plotting.figure.Figure` as in
:code:`canvas = bokeh.plotting.figure()`
or an instance of :class:`matplotlib.axes._subplots.AxesSubplot` as
in :code:`axes,canvas = matplotlib.pyplot.subplots()`
cutoff : float
if cutoff>=0, only draw edges up to length <cutoff
twocells : boolean
draw 2-cells (triangles)?
title : string
title for plot
label : boolean
label points in plot?
"""
if type(canvas).__module__ == 'bokeh.plotting.figure':
from bokeh.models import Range1d
canvas_type = "bokeh"
import bokeh.plotting
elif type(canvas).__module__ == 'matplotlib.axes._subplots':
canvas_type = "pyplot"
import matplotlib.pyplot as plt
else:
raise NotImplementedError(
"canvas must be a bokeh.plotting.figure() or a matplotlib.pyplot.subplots()[1]. You gave me {}".format(
type(canvas))
)
n, k = self.coords.shape
assert k == 2, "I can only plot in R^2. Maybe project your data first?"
if canvas_type == "bokeh":
canvas.title.text = title
elif canvas_type == "pyplot":
canvas.set_title(title)
if twocells:
raise NotImplementedError(
"Have not re-incomporated 2-cells into RCA1 yet.")
# find edges
all_edges = self.stratum[1]
if cutoff >= 0:
all_edges = all_edges[all_edges['height'] < cutoff]
if len(all_edges) > 0:
minhgt = np.min(all_edges['height'].values)
maxhgt = np.max(all_edges['height'].values)
else:
edge_alpha = 1.0
# plot positive edges, need to build structure for multi_line
if pos_edges:
pos = all_edges[all_edges['pos'] == True]
val = pos['height'].values
pt0 = self.coords.loc[pos['bdy0'].values].values
pt1 = self.coords.loc[pos['bdy1'].values].values
pts = np.hstack([pt0, pt1])
xs = pts[:, 0::2]
ys = pts[:, 1::2]
if canvas_type == "bokeh":
canvas.multi_line(list(xs),
list(ys),
line_width=1, alpha=0.4, color='orange')
elif canvas_type == "pyplot":
for i in range(xs.shape[0]):
if edge_alpha >= 0.0:
this_edge_alpha = edge_alpha
else:
this_edge_alpha = 0.5 + 0.5*(val[i] - minval)/(maxval - minval)
# should use Collections instead.
canvas.plot(xs[i, :], ys[i, :],
alpha=this_edge_alpha, color='orange')
# plot negative edges, need to build structure for multi_line
neg = all_edges[all_edges['pos'] == False]
val = neg['height'].values
pt0 = self.coords.loc[neg['bdy0'].values].values
pt1 = self.coords.loc[neg['bdy1'].values].values
pts = np.hstack([pt0, pt1])
xs = pts[:, 0::2]
ys = pts[:, 1::2]
if canvas_type == "bokeh":
canvas.multi_line(list(xs),
list(ys),
line_width=1, alpha=0.6, color='blue')
elif canvas_type == "pyplot":
for i in range(xs.shape[0]):
# should use Collections instead.
if edge_alpha >= 0.0:
this_edge_alpha = edge_alpha
else:
this_edge_alpha = 0.5 + 0.5*(val[i] - minval)/(maxval - minval)
# should use Collections instead.
canvas.plot(xs[i, :], ys[i, :],
alpha=this_edge_alpha, color='blue')
all_verts = self.stratum[0]
# CURRENT UNIONFIND DOES NOT MARK NEG VERTS
neg = all_verts[all_verts['pos'] == False]
xs = list(self.coords.loc[neg.index, 0])
ys = list(self.coords.loc[neg.index, 1])
if canvas_type == "bokeh":
canvas.circle(xs, ys, color='black', alpha=0.5, size=size)
elif canvas_type == "pyplot":
canvas.scatter(x=xs, y=ys, s=size, color='black', alpha=0.5)
pos = all_verts[all_verts['pos'] == True]
xs = self.coords.loc[pos.index, 0]
ys = self.coords.loc[pos.index, 1]
cs = list(self.label_info.index[self.labels[pos.index]])
if canvas_type == "bokeh":
# fix the aspect ratio!
xmid = (xs.max() + xs.min())/2.0
ymid = (ys.max() + ys.min())/2.0
span = max([xs.max() - xmid,
xmid - xs.min(),
ys.max() - ymid,
ymid - ys.min()])
canvas.x_range = Range1d(xmid-span, xmid+span)
canvas.y_range = Range1d(ymid-span, ymid+span)
canvas.circle(list(xs), list(ys), color=cs, alpha=0.4, size=size)
elif canvas_type == "pyplot":
canvas.scatter(x=xs, y=ys, color=cs, alpha=0.4, s=size)
if label:
if canvas_type == "bokeh":
canvas.text(xs, ys, list(map(str, list(pos.index.values))))
pass
[docs] def gaussian_fit(self, center=None):
r"""
Fit a normalized Gaussian to this cloud (using SVD).
Parameters
----------
center
If center is None (default), we find the best Gaussian with free mean.
If center is given as an integer, use the point with that integer as the
mean of the Gaussian.
If center is given as a tuple or array, use that coordinate point as
the mean of the Gaussian.
Returns
-------
(mean, sigma, rotation) for the Gaussian, suitable for `gaussian`
"""
return fast_algorithms.gaussian_fit(self.coords.values, center)
[docs] def cache_usage(self):
r""" Compute the size of the distance cache. That is, what
fraction of distances have we computed so far? """
n = self.coords.values.shape[0]
n_choose_2 = 0.5*n*(n-1)
if self.cache_type is None:
return 0.0
elif self.cache_type == "np":
computed = np.count_nonzero(self.dist_cache >= 0)
return (computed - n)/n_choose_2
elif self.cache_type == "dict":
computed = len(self.dist_cache)
return (computed - n)/n_choose_2
[docs] def nearest_neighbors_slow(self, k):
r""" Compute k nearest-neighbors of the PointCloud, by brute-force.
Answers are cached in `self._nn[k]`
Parameters
----------
k: int
How many nearest neighbors to compute
Returns
-------
np array with dtype int and shape==(N,k+1). Entry [i,j] is the jth
nearest neighbor of vertex i. Note that entry [i,0] == i, so [i,k]
is the kth nearest neighbor.
Notes
-----
This method is intended for testing, and should only be used on small datasets.
On a random example with 1,000 points in :math:`\mathbb{R}^2:` seeking `k=5` nearest
neighbors, this method takes at least twice as long as :func:`nearest_neighbors`, and the
discrepency is roughly quadratic. On 2,000 points, it is about 4 times slower.
Examples
--------
>>> pc = PointCloud(np.array([[ 0.58814682, 0.45405299],
... [ 0.09197879, 0.39721367],
... [ 0.29128654, 0.28372039],
... [ 0.14593167, 0.7027367 ],
... [ 0.77068438, 0.37849037],
... [ 0.17281855, 0.70204687],
... [ 0.48146217, 0.54619034],
... [ 0.27831744, 0.67327757],
... [ 0.49074255, 0.70847318],
... [ 0.132656, 0.0860524 ]]))
>>> pc.nearest_neighbors_slow(3)
array([[0, 6, 4, 8],
[1, 2, 3, 9],
[2, 1, 9, 6],
[3, 5, 7, 1],
[4, 0, 6, 8],
[5, 3, 7, 1],
[6, 0, 8, 7],
[7, 5, 3, 8],
[8, 6, 7, 0],
[9, 2, 1, 6]])
See Also
--------
:func:`multidim.PointCloud.nearest_neighbors`
"""
# simple cache
if k in self._nn:
return self._nn[k]
num_points = self.coords.shape[0]
self._nn[k] = np.ndarray(shape=(num_points, k+1), dtype=np.int64)
all_points = self.coords.index.values
dists = self.dists(all_points, all_points)
self._nn[k] = dists.argsort(axis=1)[:, :k+1] # 0th entry is always self.
return self._nn[k]
[docs] def nearest_neighbors(self, k):
r""" Compute k nearest-neighbors of the PointCloud, using a clever CoverTree algorithm.
Answers are cached in `self._nn[k]`
Parameters
----------
k: int
How many nearest neighbors to compute
Returns
-------
np array with dtype int and shape==(N,k+1). Entry [i,j] is the jth
nearest neighbor of vertex i. Note that entry [i,0] == i, so [i,k]
is the kth nearest neighbor.
Examples
--------
>>> pc = PointCloud(np.array([[ 0.58814682, 0.45405299],
... [ 0.09197879, 0.39721367],
... [ 0.29128654, 0.28372039],
... [ 0.14593167, 0.7027367 ],
... [ 0.77068438, 0.37849037],
... [ 0.17281855, 0.70204687],
... [ 0.48146217, 0.54619034],
... [ 0.27831744, 0.67327757],
... [ 0.49074255, 0.70847318],
... [ 0.132656, 0.0860524 ]]))
>>> pc.nearest_neighbors(3)
array([[0, 6, 4, 8],
[1, 2, 3, 9],
[2, 1, 9, 6],
[3, 5, 7, 1],
[4, 0, 6, 8],
[5, 3, 7, 1],
[6, 0, 8, 7],
[7, 5, 3, 8],
[8, 6, 7, 0],
[9, 2, 1, 6]])
See Also
--------
:func:`multidim.PointCloud.nearest_neighbors_slow`
"""
from . import covertree
if self.covertree is None:
self.covertree = covertree.CoverTree(self)
# simple cache
if k in self._nn:
return self._nn[k]
num_points = self.coords.shape[0]
# -1 means "not found yet"
self._nn[k] = -np.ones(shape=(num_points, k+1), dtype=np.int64)
# Make sure we have the entire covertree.
levels = [ level for level in self.covertree ]
# run backwards:
for level in reversed(levels):
r = level.radius
for ci in level.adults:
for x in level.children[ci]:
unknown_neighbors = np.where(self._nn[k][x] < 0)[0]
if len(unknown_neighbors) > 0:
to_find = unknown_neighbors[0]
candidates = []
for cj in level.friends1[ci]:
candidates.extend(level.children[cj])
candidates = np.array(candidates)
num_found = min(k+1, len(candidates))
# don't bother computing lengths if there is nothing to
# learn
if num_found >= to_find:
dists = fast_algorithms.distance_cache_None(
np.array([x]), candidates,
self.coords.values).flatten()
order = dists.argsort()
self._nn[k][x, to_find:num_found] = candidates[order][to_find:num_found]
return self._nn[k]
#
# all_points = self.coords.index.values
# dists = self.dists(all_points, all_points)
# self._nn[k] = dists.argsort(axis=1)[:, :k+1] # 0th entry is always self.
# return self._nn[k]
[docs] def witnessed_barycenters(self, k):
r""" Build the PointCloud of k-witnessed barycenters, weighted by
distance-to-measure. This calls :func:`nearest_neighbors` with argument
:code:`(k-1)`, which can be slow.
Parameters
----------
k : int
How many vertices for each witnessed barycenter. That is, use
the (k-1) nearest neighbors, along with the vertex itself.
Returns
-------
pc : :class:`PointCloud`
A pointcloud whose 0-cells are the witnessed barycenters, and
whose 1-cells are the edges between
those barycenters, all weighted by the notion of distance to a
measure.
"""
n, d = self.coords.values.shape
# First, look at the indices to uniqify
polygons_indices = [tuple(np.sort(verts)) for verts in self.nearest_neighbors(k-1)]
polygons_indices = np.array(list(set(polygons_indices)))
p = polygons_indices.shape[0]
assert p <= n
# build the polygons from coordinates
polygons = []
for points in polygons_indices:
polygons.append(self.coords.values[points, :])
polygons = np.array(polygons)
# Find the vertex barycenters
assert polygons.shape == (p, k, d)
polygons = polygons.swapaxes(0, 1)
assert polygons.shape == (k, p, d)
barycenters = polygons.mean(axis=0)
assert barycenters.shape == (p, d)
# compute weights
diffs = polygons - barycenters
assert diffs.shape == (k, p, d)
norms = np.linalg.norm(diffs, axis=2)**2
assert norms.shape == (k, p)
weights = -norms.sum(axis=0)/k
assert weights.shape == (p,)
pcbc = PointCloud(barycenters,
heights=np.sqrt(-weights),
dist=self.dist)
pcbc.dists = squareform(pdist(pcbc.coords.values, pcbc.dist))
# make edges
hgt = []
pos = []
idx = []
bdy0 = []
bdy1 = []
for e, (i, j) in enumerate(itertools.combinations(pcbc.stratum[0].index.values, 2)):
idx.append(e)
bdy0.append(i)
bdy1.append(j)
pos.append(True)
mu = pcbc.dists[i, j]
wi = weights[i]
wj = weights[j]
r = np.sqrt(mu**2*(mu**2 - 2*wi - 2*wj) + (wi - wj)**2)/2/mu
hgt.append(r)
edges = pd.DataFrame({
'height': hgt,
'pos': pos,
'rep': idx,
'bdy0': bdy0,
'bdy1': bdy1,
},
columns=['height', 'pos', 'rep', 'bdy0', 'bdy1'],
index=idx)
pcbc.stratum[1] = edges
return pcbc
[docs] def unique_with_multiplicity(self):
r"""
Look for duplicate points, and mark their multiplicity.
This sets self.multiplicity
ToDo: Use Covertree.
Examples
--------
>>> a = np.array([[5.0, 2.0], [3.0, 4.0], [5.0, 2.0]])
>>> pc = PointCloud(a)
>>> b, counts = pc.unique_with_multiplicity()
>>> print(b)
[[ 3. 4.]
[ 5. 2.]]
>>> print(counts)
[1 2]
"""
coords = self.coords.values
assert coords.shape[1] == 2,\
"This uniqifying method can use only 2-dim data"
s = coords.shape
assert coords.dtype == 'float64'
coords.dtype = 'complex128'
tmp_coords, tmp_index, tmp_inverse, tmp_counts = np.unique(
coords.flatten(),
return_index=True,
return_inverse=True,
return_counts=True)
tmp_coords.dtype = 'float64'
coords.dtype = 'float64'
assert coords.shape == s
n = tmp_coords.shape[0]
d = coords.shape[1]
assert n % d == 0
tmp_coords.shape = (n//d, d)
return tmp_coords, tmp_counts
[docs] def dists(self, indices0, indices1):
r""" Compute distances points indices0 and indices1.
indices0 and indices1 must be 1-dimensional np arrays,
so use not "5" but "np.array([5])"
The return is a np array with shape == (len(indices0), len(indices1))
This uses the distance cache, depending on self.cache_type.
You can query the size of the cache with :func:`cache_usage`.
"""
# Allow boolean selectors
if indices0.dtype == np.uint8:
indices0 = np.where(indices0)[0]
if indices1.dtype == np.uint8:
indices1 = np.where(indices1)[0]
if self.cache_type is None:
return fast_algorithms.distance_cache_None(indices0,
indices1,
self.coords.values)
elif self.cache_type == "np":
N = self.coords.values.shape[0]
if self.dist_cache is None or self.dist_cache.shape != (N, N):
self.dist_cache = np.eye(N, dtype=np.float64) - 1.0
return fast_algorithms.distance_cache_numpy(indices0,
indices1,
self.coords.values,
self.dist_cache)
elif self.cache_type == "dict":
N = self.coords.values.shape[0]
if self.dist_cache is None:
self.dist_cache = dict(((i, i), np.float64(0.0)) for i in range(N))
return fast_algorithms.distance_cache_dict(indices0,
indices1,
self.coords.values,
self.dist_cache)
else:
raise ValueError("cache_type can be None or 'dict' or 'np'")
[docs] def cover_ball(self, point_index=None):
r""" Find a ball that covers the entire PointCloud.
Parameters
----------
point_index : int
If point_index is given, we use that point as the center.
If point_index is None (default), then we compute the point neareast
the center-of-mass, which requires an extra :func:`numpy.mean` and
:func:`scipy.spatial.distance.cdist` call.
Returns
-------
ball : dict with keys 'index' (index of designated center point),
'point' (coordinates of designated center point), and 'radius' (radius
of ball)
"""
if point_index is None:
center = self.coords.values.mean(axis=0)
center.shape = (1, center.shape[0]) # re-shape for cdist
center_dists = cdist(self.coords.values, center, metric=self.dist)
point_index = center_dists.argmin()
point = np.array([point_index])
indices = self.coords.index.values
point_dists = self.dists(point, indices).flatten()
return {'index': point_index,
'point': self.coords.values[point_index, :],
'radius': point_dists.max()}
[docs] @classmethod
def from_multisample_multilabel(cls, list_of_samples, list_of_labels,
equal_priors=True, normalize_domain=False):
r""" Produce a single labeled and weighted pointcloud from a list of samples
and labels of those samples.
Parameters
----------
list_of_samples :
A list (or np array) of np arrays. Each such array is
considered to be a sample of N points in R^d. N can vary between
entries, but d cannot.
list_of_labels :
A list of labels. Labels can be anything, but it is covenient to
use strings like "red" and "blue". list_of_labels[i] is the
label for the points in list_of_samples[i].
equal_priors:
Re-normalize weights so that each label is equally likely.
Default: True
normalize_domain:
Use SVD/PCA to re-shape the original data to be roughy spherical.
This should allow better learning via CDER.
Default: False
"""
assert len(list_of_labels) == len(list_of_samples),\
"list_of_labels must equal list_of arrays. {} != {}".format(
len(list_of_labels), len(list_of_samples))
ambient_dim = list_of_samples[0].shape[1]
assert all([X.shape[1] == ambient_dim for X in list_of_samples]),\
"Dimension mismatch among list_of_samples!"
label_info = pd.DataFrame(index=sorted(list(set(list(list_of_labels)))))
# ['blue', 'green', 'red']
# label_names = list(label_dict.keys())
# label_index = np.array
# num_labels = len(label_dict)
# count how many times each label occurs.
label_info['clouds'] = np.zeros(shape = (len(label_info),), dtype=np.int64)
label_info['points'] = np.zeros(shape = (len(label_info),), dtype=np.int64)
label_info['weight'] = np.zeros(shape = (len(label_info),), dtype=np.float64)
for label in label_info.index:
label_bool = np.array([l == label for l in list_of_labels])
label_info.loc[label, 'clouds'] = np.count_nonzero(label_bool)
label_info['int_index'] = label_info.index.get_indexer(label_info.index)
# merge the samples into one big dataset
# expand the sample-wise labels to point-wise labels
points = np.concatenate(list(list_of_samples))
if normalize_domain:
m,s,v = fast_algorithms.gaussian_fit(points)
points = np.dot((points - m), v.T)/s
pointwise_labels = [] # keep track of labels, pointwise
pointwise_source = []
pointwise_weight = []
for i, X in enumerate(list_of_samples):
l = list_of_labels[i]
num = X.shape[0]
assert num > 0, "bad? {}".format(X.shape)
pointwise_labels.extend([label_info['int_index'].loc[l]]*num)
pointwise_source.extend([i]*num)
if equal_priors:
wt = 1.0/num/label_info.loc[l, 'clouds']
else:
wt = 1.0
pointwise_weight.extend([wt]*num)
label_info.loc[l, 'points'] = np.int64(label_info.loc[l, 'points']) + np.int64(num)
label_info.loc[l, 'weight'] += wt*num
pointwise_labels = np.array(pointwise_labels)
pointwise_source = np.array(pointwise_source)
pointwise_weight = np.array(pointwise_weight)
pc = cls(points, masses=pointwise_weight)
pc.label_info = label_info
pc.labels = pointwise_labels
pc.source = pointwise_source
return pc