Source code for loci.clustering

import pandas as pd
import geopandas

from pandas import merge
from time import time
from sklearn.cluster import DBSCAN
import numpy as np
from shapely.ops import cascaded_union
from geopandas import GeoDataFrame
from hdbscan import HDBSCAN
from shapely.geometry import MultiPoint


def compute_clusters(pois, alg='dbscan', min_pts=None, eps=None, n_jobs=1):
    """Computes clusters using the DBSCAN or the HDBSCAN algorithm.

    Note that this function adds (or overwrites) a ``cluster_id`` column on
    the ``pois`` frame passed in; noise points get the label -1 there.

    Args:
        pois (GeoDataFrame): A POI GeoDataFrame. Each entry of its
            ``geometry`` column must expose ``x`` and ``y`` attributes.
        alg (string): The clustering algorithm to use (dbscan or hdbscan;
            default: dbscan). Any value other than ``'hdbscan'`` selects
            DBSCAN.
        min_pts (integer): The minimum number of neighbors for a dense point.
            For HDBSCAN it is used both as ``min_cluster_size`` and as
            ``min_samples``.
        eps (float): The neighborhood radius (used by DBSCAN only).
        n_jobs (integer): Number of parallel jobs to run in the algorithm
            (default: 1).

    Returns:
        A tuple ``(pois_in_clusters, eps_per_cluster)``:

        * ``pois_in_clusters``: the rows of ``pois`` whose ``cluster_id`` is
          greater than -1 (i.e. the noise points are left out).
        * ``eps_per_cluster``: a DataFrame with columns ``eps`` and
          ``cluster_size`` and a 0..k-1 index, one row per cluster. For
          DBSCAN every row holds the given ``eps`` and ``cluster_size`` is
          filled with 0; for HDBSCAN ``eps`` is ``1 / lambda_val`` of the
          selected cluster in the condensed tree and ``cluster_size`` is its
          ``child_size``.
    """
    # One [x, y] row per POI.
    data_arr = np.array([[p.x, p.y] for p in pois['geometry']])

    # Compute the clusters
    t0 = time()

    if alg == 'hdbscan':
        clusterer = HDBSCAN(min_cluster_size=min_pts, min_samples=min_pts,
                            core_dist_n_jobs=n_jobs)
        labels = clusterer.fit_predict(data_arr)

        # Keep only the condensed-tree rows whose child is itself a cluster
        # (child_size > 1) and was selected as a final cluster.
        tree = clusterer.condensed_tree_.to_pandas()
        cluster_tree = tree[tree.child_size > 1]
        # NOTE(review): _select_clusters() is a private hdbscan method; the
        # row order of the result is presumably aligned with the cluster
        # labels -- confirm against the installed hdbscan version.
        chosen_clusters = clusterer.condensed_tree_._select_clusters()

        eps_per_cluster = (
            cluster_tree[cluster_tree.child.isin(chosen_clusters)]
            .drop(["parent", "child"], axis=1)
            .reset_index(drop=True)
        )
        # lambda is an inverse distance, so its reciprocal plays the role of eps.
        eps_per_cluster['lambda_val'] = 1.0 / eps_per_cluster['lambda_val']
        eps_per_cluster = eps_per_cluster.rename(
            columns={'lambda_val': 'eps', 'child_size': 'cluster_size'})
    else:
        clusterer = DBSCAN(eps=eps, min_samples=min_pts, n_jobs=n_jobs).fit(data_arr)
        labels = clusterer.labels_

        # Every DBSCAN cluster shares the same eps; one row per real cluster.
        eps_per_cluster = pd.DataFrame({'eps': [eps] * len(set(labels) - {-1})})
        # NOTE(review): cluster_size is only a placeholder for DBSCAN.
        eps_per_cluster['cluster_size'] = 0

    print("Done in %0.3fs." % (time() - t0))

    # The label -1 marks noise and must not be reported as a cluster.
    num_of_clusters = len(set(labels) - {-1})

    # Assign cluster labels to initial POIs
    pois['cluster_id'] = labels

    # Separate POIs that are inside clusters from those that are noise
    pois_in_clusters = pois.loc[pois['cluster_id'] > -1]
    pois_noise = pois.loc[pois['cluster_id'] == -1]

    print('Number of clusters: %d' % num_of_clusters)
    print('Number of clustered POIs: %d' % (len(pois_in_clusters)))
    print('Number of outlier POIs: %d' % (len(pois_noise)))

    return pois_in_clusters, eps_per_cluster
def _convex_hull_shapes(pois):
    """Builds one convex hull per cluster from all of its POIs (method 1)."""
    cluster_borders = pois.groupby(['cluster_id'], sort=False)['geometry'].agg([list, np.size])
    cluster_borders['list'] = [MultiPoint(points).convex_hull
                               for points in cluster_borders['list']]
    cluster_borders.rename(columns={"list": "geometry"}, inplace=True)
    cluster_borders.sort_index(inplace=True)
    cluster_borders = GeoDataFrame(cluster_borders, crs=pois.crs, geometry='geometry')
    # Turn the cluster_id index into a regular column.
    cluster_borders.reset_index(inplace=True)
    return cluster_borders


def _buffered_union_shapes(pois, eps_per_cluster):
    """Unions eps-radius buffers around the POIs of each cluster (method 2)."""
    grouped = pois.groupby(['cluster_id'], sort=False)['geometry'].agg([list, np.size])
    # The inner join keeps only clusters that have an eps value; cluster ids
    # are matched against the index of eps_per_cluster.
    join_df = merge(grouped, eps_per_cluster, left_index=True, right_index=True, how='inner')

    join_df['geometry'] = [
        cascaded_union([point.buffer(eps) for point in points])
        for points, eps in zip(join_df['list'], join_df['eps'])
    ]
    join_df['cluster_id'] = join_df.index
    join_df.reset_index(drop=True, inplace=True)

    # Selecting the output columns directly discards the helper columns
    # without requiring any particular extra column in eps_per_cluster.
    return GeoDataFrame(join_df[['cluster_id', 'size', 'geometry']],
                        crs=pois.crs, geometry='geometry')


def _local_hull_shapes(pois, eps_per_cluster):
    """Unions convex hulls of the POIs found inside each POI's eps-circle (method 3)."""
    eps_by_cluster = eps_per_cluster['eps'].to_dict()
    cluster_sizes = pois['cluster_id'].value_counts().to_dict()

    # Replace every POI by a circle whose radius is the eps of its cluster.
    circles_from_pois = pois.copy()
    circles_from_pois['geometry'] = [
        geom.buffer(eps_by_cluster[cid])
        for geom, cid in zip(pois['geometry'], pois['cluster_id'])
    ]

    # NOTE(review): the `op` keyword is the one this module was written
    # against; newer geopandas releases call it `predicate` -- confirm against
    # the installed version.
    pois_in_circles = geopandas.sjoin(pois, circles_from_pois, how="inner", op='intersects')
    agged_pois_per_circle = pois_in_circles.groupby(
        ['cluster_id_left', 'index_right'], sort=False)['geometry'].agg([list])

    # One convex hull per (cluster, circle) pair; groups with fewer than three
    # points are skipped.
    hull_cluster_ids = []
    hulls = []
    for group_key, members in zip(agged_pois_per_circle.index, agged_pois_per_circle['list']):
        if len(members) >= 3:
            hulls.append(MultiPoint(members).convex_hull)
            hull_cluster_ids.append(group_key[0])

    temp_df = pd.DataFrame({
        'cluster_id': hull_cluster_ids,
        'geometry': hulls
    })
    grouped_poly_per_cluster = temp_df.groupby(['cluster_id'], sort=False)['geometry'].agg([list])

    merged_polys = [cascaded_union(polys) for polys in grouped_poly_per_cluster['list']]
    sizes = [cluster_sizes[cid] for cid in grouped_poly_per_cluster.index]

    grouped_poly_per_cluster['geometry'] = merged_polys
    grouped_poly_per_cluster.drop(['list'], axis=1, inplace=True)

    cluster_borders = GeoDataFrame(grouped_poly_per_cluster, crs=pois.crs, geometry='geometry')
    cluster_borders['cluster_id'] = cluster_borders.index
    cluster_borders['size'] = sizes
    return cluster_borders


def cluster_shapes(pois, shape_type=1, eps_per_cluster=None):
    """Computes cluster shapes.

    Args:
        pois (GeoDataFrame): The clustered POIs; must contain the columns
            ``cluster_id`` and ``geometry``.
        shape_type (integer): The method to use for computing cluster shapes
            (allowed values: 1-3). 1 builds the convex hull of each cluster's
            POIs; 2 unions eps-radius buffers around each cluster's POIs;
            3 unions the convex hulls of the POIs that fall inside the
            eps-circle of each POI. Any other value behaves like 1.
        eps_per_cluster (DataFrame): The value of parameter eps used for each
            cluster, in a column named ``eps`` and indexed by cluster id
            (required by methods 2 and 3).

    Returns:
        A GeoDataFrame containing the cluster shapes, with the columns
        ``cluster_id``, ``size`` (number of POIs in the cluster) and
        ``geometry``. Methods 1 and 2 return it with a 0..k-1 index, method 3
        returns it indexed by cluster id.

    Raises:
        ValueError: If ``shape_type`` is 2 or 3 and ``eps_per_cluster`` is
            not given.
    """
    # Fail early with a clear message instead of an unrelated error raised
    # from deep inside the merge / row iteration.
    if shape_type in (2, 3) and eps_per_cluster is None:
        raise ValueError("eps_per_cluster is required when shape_type is 2 or 3")

    t0 = time()

    if shape_type == 2:
        cluster_borders = _buffered_union_shapes(pois, eps_per_cluster)
    elif shape_type == 3:
        cluster_borders = _local_hull_shapes(pois, eps_per_cluster)
    else:
        # type == 1 (default)
        cluster_borders = _convex_hull_shapes(pois)

    print("Done in %0.3fs." % (time() - t0))

    return cluster_borders