Source code for loci.analytics

from collections import Counter
from itertools import chain

import geopandas as gpd
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
from shapely.geometry import box, GeometryCollection


def filter_by_kwd(df, kwd_filter, col_kwds='kwds'):
    """Returns a DataFrame with only those rows that contain the specified keyword.

    The comparison is case-insensitive: both `kwd_filter` and every keyword of
    a row are lower-cased before being compared for equality.

    Args:
        df (DataFrame): The initial DataFrame to be filtered.
        kwd_filter (string): The keyword to use for filtering.
        col_kwds (string): Name of the column containing the keywords
            (default: `kwds`). Each cell is iterated and every element is
            lower-cased, so cells are expected to hold iterables of strings.

    Returns:
        The rows of `df` whose keywords contain `kwd_filter`, with all columns
        preserved (also when no row matches or `df` is empty).
    """
    # Lower-case the search term once instead of once per row.
    needle = kwd_filter.lower()

    mask = df[col_kwds].apply(
        lambda kwds: any(needle == kwd.lower() for kwd in kwds)
    )

    # On an empty frame `apply` returns a non-boolean Series, which `df[...]`
    # can treat as an (empty) list of column labels, dropping every column.
    # Forcing a boolean dtype keeps this a row filter in all cases.
    return df[mask.astype(bool)]
def bbox(gdf):
    """Computes the bounding box of a GeoDataFrame.

    Args:
        gdf (GeoDataFrame): A GeoDataFrame.

    Returns:
        A Polygon representing the bounding box enclosing all geometries in
        the GeoDataFrame.
    """
    # `total_bounds` yields (minx, miny, maxx, maxy), which is exactly the
    # positional order `box` expects.
    return box(*gdf.geometry.total_bounds)
def kwds_freq(gdf, col_kwds='kwds', normalized=False):
    """Computes the frequency of keywords in the provided GeoDataFrame.

    Every occurrence of a keyword is counted, so a keyword that is repeated
    inside a single row contributes more than once for that row.

    Args:
        gdf (GeoDataFrame): A GeoDataFrame with a keywords column.
        col_kwds (string): The column containing the list of keywords
            (default: `kwds`).
        normalized (bool): If True, each count is divided by the number of
            rows in `gdf` (default: False). The result lies in [0,1] as long
            as no row repeats a keyword.

    Returns:
        A dictionary mapping each keyword to its number of occurrences (or to
        its normalized frequency when `normalized` is True). Keywords appear
        in order of first occurrence.
    """
    kwds_ser = gdf[col_kwds]

    # Iterate the Series values directly: `Series.iteritems()` no longer
    # exists in pandas >= 2.0 and the index was never used anyway.
    counts = Counter(chain.from_iterable(kwds_ser))

    if not normalized:
        # Hand back a plain dict, as callers received before.
        return dict(counts)

    # If there are no rows there are no keywords either, so the division
    # below is never evaluated with a zero denominator.
    num_of_records = kwds_ser.size
    return {kwd: freq / num_of_records for kwd, freq in counts.items()}
def freq_locationsets(location_visits, location_id_col, locations, locationset_id_col, min_sup, min_length):
    """Computes frequently visited sets of locations based on frequent itemset mining.

    Visits are grouped per locationset id into sets of location ids, one-hot
    encoded, and mined with the Apriori algorithm.

    Args:
        location_visits (DataFrame): A DataFrame with location ids and
            locationset ids.
        location_id_col (String): The name of the column containing the
            location ids.
        locations (GeoDataFrame): A GeoDataFrame containing the geometries of
            the locations. It is looked up with `locations.loc[location_id]`,
            so its index must hold the location ids.
        locationset_id_col (String): The name of the column containing the
            locationsets ids.
        min_sup (float): The minimum support threshold.
        min_length (int): Minimum length of itemsets to be returned.

    Returns:
        A GeoDataFrame with the support, length and geometry of the computed
        location sets. The member ids of each set are in the `location_ids`
        column and its geometry is a GeometryCollection of the member
        locations' geometries.
    """
    # One transaction (set of visited location ids) per locationset.
    itemsets = location_visits.groupby([locationset_id_col], sort=False)[location_id_col].agg(set)

    te = TransactionEncoder()
    oht_ary = te.fit(itemsets).transform(itemsets.values, sparse=True)

    # `pd.SparseDataFrame` was removed in pandas 1.0; the sparse accessor is
    # its replacement for wrapping a scipy sparse matrix.
    # NOTE(review): mlxtend documents limitations for sparse frames with
    # non-string column names - confirm against the location id type in use.
    sparse_df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)

    apriori_df = apriori(sparse_df, min_support=min_sup, use_colnames=True)
    apriori_df['length'] = apriori_df['itemsets'].apply(len)

    # Copy so the column assignment below writes to an independent frame
    # rather than to a slice of the unfiltered result.
    apriori_df = apriori_df[apriori_df['length'] >= min_length].copy()

    def cluster_id_to_geom(location_ids):
        # Gather the geometry of every location belonging to the itemset.
        return GeometryCollection([locations.loc[loc_id].geometry for loc_id in location_ids])

    apriori_df['geometry'] = apriori_df['itemsets'].apply(cluster_id_to_geom)

    result = gpd.GeoDataFrame(apriori_df, crs=locations.crs, geometry=apriori_df.geometry)
    result.rename(columns={'itemsets': 'location_ids'}, inplace=True)
    return result