GeoPandas: trovare il punto più vicino in un altro dataframe

20

Ho 2 geodataframes:

import geopandas as gpd
from shapely.geometry import Point
gpd1 = gpd.GeoDataFrame([['John',1,Point(1,1)],['Smith',1,Point(2,2)],['Soap',1,Point(0,2)]],columns=['Name','ID','geometry'])
gpd2 = gpd.GeoDataFrame([['Work',Point(0,1.1)],['Shops',Point(2.5,2)],['Home',Point(1,1.1)]],columns=['Place','geometry'])

e voglio trovare il nome del punto più vicino in gpd2 per ogni riga in gpd1:

desired_output = 

    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

Ho provato a farlo funzionare usando una funzione lambda:

gpd1['Nearest'] = gpd1.apply(lambda row: min_dist(row.geometry,gpd2)['Place'] , axis=1)

con

def min_dist(point, gpd2):
    """Placeholder from the question: look up the gpd2 row nearest to ``point``.

    The body is a stub; ``some_function`` stands in for the logic the
    question asks for and is not defined anywhere in this snippet.
    """
    return some_function()

— redM
fonte

Questo metodo ha funzionato per me: stackoverflow.com/questions/37402046/… guarda il link

— Johnny Cheesecutter

16

È possibile utilizzare direttamente la funzione nearest_points di Shapely (le geometrie di una GeoSeries sono geometrie Shapely):

from shapely.ops import nearest_points
# Unary union of the gpd2 geometries: one combined geometry that
# nearest_points can search in a single call.
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
    """Return the ``Place`` value of the gpd2 geometry nearest to ``point``.

    Parameters
    ----------
    point : shapely geometry
        Geometry to search from.
    pts : shapely geometry, optional
        Combined candidate geometries; defaults to the union of gpd2.

    Raises
    ------
    IndexError
        If no gpd2 geometry equals the nearest point found in ``pts``.
    """
    # nearest_points returns a pair; element [1] is the point lying on `pts`.
    nearest_geom = nearest_points(point, pts)[1]
    # geom_equals compares geometries spatially. A plain ``==`` on the
    # GeoSeries can come back all-False even when an equal geometry exists.
    nearest = gpd2.geometry.geom_equals(nearest_geom)
    # .values instead of Series.get_values(), which was removed in pandas 1.0.
    return gpd2[nearest].Place.values[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

Spiegazione

# Show, for every gpd1 row, the pair returned by nearest_points:
# the point on the row geometry and the matching point inside pts3.
for i, row in gpd1.iterrows():
    # Compute the pair once instead of calling nearest_points twice.
    origin, target = nearest_points(row.geometry, pts3)
    # print() call form: the Python 2 print statement is a SyntaxError on 3.x.
    print(origin, target)
 POINT (1 1) POINT (1 1.1)
 POINT (2 2) POINT (2.5 2)
 POINT (0 2) POINT (0 1.1)

— gene
fonte

Qualcosa non funziona per me e non riesco a capire cosa. La funzione restituisce una GeoSeries vuota anche se la geometria è valida. Ad esempio, con sample_point = gpd2.geometry.unary_union[400], l'espressione sample_point in gpd2.geometry restituisce True, mentre gpd2.geometry == sample_point restituisce False per tutti gli elementi.

— robroc,

Aggiunta a quanto sopra: gpd2.geometry.geom_equals(sample_point) funziona.

— robroc,

13

Se disponi di dataframe di grandi dimensioni, ho scoperto che il metodo .query dell'indice spaziale cKDTree di scipy restituisce risultati molto rapidamente per le ricerche del vicino più prossimo. Poiché utilizza un indice spaziale, è più veloce di ordini di grandezza rispetto a iterare sul dataframe e cercare poi il minimo di tutte le distanze. È anche più veloce dell'uso di nearest_points di shapely con RTree (il metodo di indicizzazione spaziale disponibile tramite geopandas), perché cKDTree consente di vettorizzare la ricerca, mentre l'altro metodo no.

Ecco una funzione di supporto che restituisce la distanza e il 'Nome' del vicino più prossimo in gpd2 per ogni punto in gpd1. Presuppone che entrambi i gdf abbiano una colonna geometry (di punti).

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)], ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)], ['Shops', Point(2.5, 2)],
                         ['Home', Point(1, 1.1)]],
                        columns=['Place', 'geometry'])

def ckdnearest(gdA, gdB):
    """Attach to every point of ``gdA`` its nearest point of ``gdB``.

    Both frames are expected to hold point geometries in a ``geometry``
    column (the x/y accessors of the geometry series are used).

    Parameters
    ----------
    gdA : GeoDataFrame
        Query points; one output row is produced per row of ``gdA``.
    gdB : GeoDataFrame
        Candidate points to search.

    Returns
    -------
    DataFrame
        ``gdA`` (re-indexed 0..n-1), the non-geometry columns of the matched
        ``gdB`` rows preceded by their original ``gdB`` index labels
        (a column named ``index`` when that index is unnamed), and a
        ``dist`` column with the distance reported by ``cKDTree.query``.
    """
    # concat(axis=1) aligns on index labels, so gdA must share the 0..n-1
    # index of the pieces built below; otherwise rows end up misaligned.
    gdA = gdA.reset_index(drop=True)
    nA = np.column_stack((gdA.geometry.x, gdA.geometry.y))
    nB = np.column_stack((gdB.geometry.x, gdB.geometry.y))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    # idx holds row *positions* in gdB, so select with iloc: label-based
    # .loc only works by accident when gdB has a default RangeIndex.
    matched = gdB.iloc[idx].loc[:, gdB.columns != 'geometry'].reset_index()
    gdf = pd.concat([gdA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

ckdnearest(gpd1, gpd2)

E se vuoi trovare il punto più vicino a un LineString, ecco un esempio completo funzionante:

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    """Attach to every point of ``gdfA`` the ``gdfB`` geometry with the nearest vertex.

    The search runs over the *vertices* of the ``gdfB`` geometries, so
    ``dist`` is the distance to the closest vertex, not to the closest
    location along a segment.

    Parameters
    ----------
    gdfA : GeoDataFrame
        Query geometries. Every coordinate of every geometry is queried, so
        single-coordinate points are expected (one output row per row).
    gdfB : GeoDataFrame
        Candidate geometries exposing ``.coords`` (e.g. LineStrings).
    gdfB_cols : list of str
        Columns of ``gdfB`` to copy onto the result.

    Returns
    -------
    DataFrame
        ``gdfA`` (re-indexed 0..n-1) plus ``gdfB_cols`` of the matched rows
        and a ``dist`` column.
    """
    # The tree returns row positions and concat(axis=1) aligns on labels;
    # positional indexes on both frames keep lookup and alignment correct.
    gdfA = gdfA.reset_index(drop=True)
    gdfB = gdfB.reset_index(drop=True)
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # B_ix[k] is the gdfB row that vertex k of the flattened array belongs to.
    B_ix = np.repeat(np.arange(len(B)), [len(coords) for coords in B])
    ckd_tree = cKDTree(np.concatenate(B))
    dist, idx = ckd_tree.query(A, k=1)
    # Array indexing always yields an array; itemgetter(*idx) collapsed to a
    # scalar for a single query point and silently dropped the columns.
    matched = gdfB.loc[B_ix[idx], gdfB_cols].reset_index(drop=True)
    gdf = pd.concat([gdfA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

c = ckdnearest(gpd1, gpd2)

— JHuw
fonte

Con questo metodo è possibile ottenere anche il punto più vicino sulla linea? Ad esempio per agganciare (snap) una posizione GPS alla strada più vicina.

— hyperknot,

Questa risposta è fantastica! Tuttavia, il codice per i punti più vicini alla linea produce un bug per me. Sembra che venga restituita la distanza corretta dalla linea più vicina per ciascun punto, ma l'id della linea restituito è errato. Penso che il problema sia nel calcolo di idx, ma sono abbastanza nuovo in Python, quindi non riesco a venirne a capo.

— Shakedk

1

Capito:

def min_dist(point, gpd2):
    """Return the row of ``gpd2`` whose geometry is closest to ``point``.

    Parameters
    ----------
    point : geometry
        Object providing ``distance(other_geometry)``.
    gpd2 : GeoDataFrame
        Candidate rows with a ``geometry`` column. It is not modified.

    Returns
    -------
    Series
        The closest row, with its distance to ``point`` added as ``Dist``.
        On ties the first row in ``gpd2`` order wins.

    Raises
    ------
    ValueError
        If ``gpd2`` has no rows.
    """
    if len(gpd2) == 0:
        raise ValueError("gpd2 has no rows to search")
    distances = [point.distance(geom) for geom in gpd2.geometry]
    # Pick the minimum by position so the result does not depend on
    # gpd2's index labels or on what Series.argmin returns.
    closest = min(range(len(distances)), key=distances.__getitem__)
    # assign() works on a copy: the caller's frame does not gain (or get
    # overwritten) a 'Dist' column on every call.
    geoseries = gpd2.assign(Dist=distances).iloc[closest]
    return geoseries

Naturalmente alcune critiche sono benvenute. Non sono un fan del ricalcolo di gpd2 ['Dist'] per ogni riga di gpd1 ...

— redM
fonte

1

La risposta di Gene non ha funzionato per me. Alla fine ho scoperto che gpd2.geometry.unary_union ha prodotto una geometria che conteneva solo circa 30.000 del mio totale di circa 150.000 punti. Per chiunque abbia riscontrato lo stesso problema, ecco come l'ho risolto:

    from shapely.ops import nearest_points
    from shapely.geometry import MultiPoint

    gpd2_pts_list = gpd2.geometry.tolist()
    gpd2_pts = MultiPoint(gpd2_pts_list)
    def nearest(point, gpd2_pts, gpd2=gpd2, geom_col='geometry', src_col='Place'):
        """Return the ``src_col`` value of the ``gpd2`` row nearest to ``point``.

        Parameters
        ----------
        point : shapely geometry
            Geometry to search from.
        gpd2_pts : shapely geometry
            All candidate points combined (e.g. a MultiPoint built from gpd2).
        gpd2 : GeoDataFrame
            Frame the candidates came from.
        geom_col : str
            Name of the geometry column in ``gpd2``.
        src_col : str
            Column whose value is returned for the nearest row.

        Raises
        ------
        IndexError
            If no ``gpd2`` geometry equals the nearest point found.
        """
        # nearest_points returns a pair; element [1] lies on gpd2_pts.
        nearest_point = nearest_points(point, gpd2_pts)[1]
        # geom_equals compares spatially; a plain ``==`` on the geometry
        # column can report no match even when an equal geometry exists.
        matches = gpd2[geom_col].geom_equals(nearest_point)
        # .values instead of Series.get_values(), removed in pandas 1.0.
        value = gpd2.loc[matches, src_col].values[0]
        return value

    gpd1['Nearest'] = gpd1.apply(lambda x: nearest(x.geometry, gpd2_pts), axis=1)

— Inske
fonte

0

Per chiunque abbia errori di indicizzazione con i propri dati usando l'eccellente risposta di @JHuw: il mio problema era che i miei indici non erano allineati. Reimpostare l'indice di gdfA e gdfB ha risolto i miei problemi; forse questo può aiutare anche te, @Shakedk.

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=['Place']):
    """Attach to every point of ``gdfA`` the ``gdfB`` geometry with the nearest vertex.

    The search runs over the *vertices* of the ``gdfB`` geometries, so
    ``dist`` is the distance to the closest vertex, not to the closest
    location along a segment.

    Parameters
    ----------
    gdfA : GeoDataFrame
        Query geometries. Every coordinate of every geometry is queried, so
        single-coordinate points are expected (one output row per row).
    gdfB : GeoDataFrame
        Candidate geometries exposing ``.coords`` (e.g. LineStrings).
    gdfB_cols : list of str
        Columns of ``gdfB`` to copy onto the result.

    Returns
    -------
    DataFrame
        ``gdfA`` (re-indexed 0..n-1) plus ``gdfB_cols`` of the matched rows
        and a ``dist`` column.
    """
    # Reset both indexes: the tree returns row positions and concat(axis=1)
    # aligns on labels, so both frames need a 0..n-1 index.
    gdfA = gdfA.reset_index(drop=True)
    gdfB = gdfB.reset_index(drop=True)
    A = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    B = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # B_ix[k] is the gdfB row that vertex k of the flattened array belongs to.
    B_ix = np.repeat(np.arange(len(B)), [len(coords) for coords in B])
    ckd_tree = cKDTree(np.concatenate(B))
    dist, idx = ckd_tree.query(A, k=1)
    # Array indexing always yields an array; itemgetter(*idx) collapsed to a
    # scalar for a single query point and silently dropped the columns.
    matched = gdfB.loc[B_ix[idx], gdfB_cols].reset_index(drop=True)
    gdf = pd.concat([gdfA, matched, pd.Series(dist, name='dist')], axis=1)
    return gdf

c = ckdnearest(gpd1, gpd2)

— Markus Rosenfelder
fonte