Thank you all for your help. I think I solved this by including all the suggestions.
I use numpy to import geographic coordinates and then design them using "France Lambert - 93". This allows me to fill in scipy.spatial.cKDTree with points, and then calculate sparse_distance_matrix, indicating a cutoff of 50 km (my projected points are indicated in meters). Then I retrieve the extraction of the lower triangle in the CSV.
import numpy as np import csv import time from pyproj import Proj, transform #http://epsg.io/2154 (accuracy: 1.0m) fr = '+proj=lcc +lat_1=49 +lat_2=44 +lat_0=46.5 +lon_0=3 \ +x_0=700000 +y_0=6600000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 \ +units=m +no_defs' #http://epsg.io/27700-5339 (accuracy: 1.0m) uk = '+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 \ +x_0=400000 +y_0=-100000 +ellps=airy \ +towgs84=446.448,-125.157,542.06,0.15,0.247,0.842,-20.489 +units=m +no_defs' path_to_csv = '.../raw_in.csv' out_csv = '.../out.csv' def proj_arr(points): inproj = Proj(init='epsg:4326') outproj = Proj(uk) # origin|destination|lon|lat func = lambda x: transform(inproj,outproj,x[2],x[1]) return np.array(list(map(func, points))) tstart = time.time() # Import points as geographic coordinates # ID|lat|lon #Sample to try and replicate #points = np.array([ # [39007,46.585012,5.5857829], # [88086,48.192370,6.7296289], # [62627,50.309155,3.0218611], # [14020,49.133972,-0.15851507], # [1091, 42.981765,2.0104902]]) # points = np.genfromtxt(path_to_csv, delimiter=',', skip_header=1) print("Total points: %d" % len(points)) print("Triangular matrix contains: %d" % (len(points)*((len(points))-1)*0.5)) # Get projected co-ordinates proj_pnts = proj_arr(points) # Fill quad-tree from scipy.spatial import cKDTree tree = cKDTree(proj_pnts) cut_off_metres = 1600 tree_dist = tree.sparse_distance_matrix(tree, max_distance=cut_off_metres, p=2) # Extract triangle from scipy import sparse udist = sparse.tril(tree_dist, k=-1) # zero the main diagonal print("Distances after quad-tree cut-off: %d " % len(udist.data)) # Export CSV import csv f = open(out_csv, 'w', newline='') w = csv.writer(f, delimiter=",", ) w.writerow(['id_a','lat_a','lon_a','id_b','lat_b','lon_b','metres']) w.writerows(np.column_stack((points[udist.row ], points[udist.col], udist.data))) f.close() """ Get ID labels """ id_to_csv = '...id.csv' id_labels = np.genfromtxt(id_to_csv, delimiter=',', skip_header=1, dtype='U') """ Try vincenty on the un-projected co-ordinates """ from geopy.distance import vincenty vout_csv = '.../out_vin.csv' test_vin = np.column_stack((points[udist.row].T[1:3].T, points[udist.col].T[1:3].T)) func = lambda x: vincenty(x[0:2],x[2:4]).m output = list(map(func,test_vin)) # Export CSV f = open(vout_csv, 'w', newline='') w = csv.writer(f, delimiter=",", ) w.writerow(['id_a','id_a2', 'lat_a','lon_a', 'id_b','id_b2', 'lat_b','lon_b', 'proj_metres','vincenty_metres']) w.writerows(np.column_stack((list(id_labels[udist.row]), points[udist.row ], list(id_labels[udist.col]), points[udist.col], udist.data, output, ))) f.close() print("Finished in %.0f seconds" % (time.time()-tstart)
This approach took 164 seconds to generate (for 5,306,434 distances) - compared to 9 - and also about 90 seconds to save to disk.
Then I compared the difference in vincenty distance and hypotenuse distance (from projected coordinates).
The average difference in meters was 2.7, and the average difference / meters was 0.0073% - which looks great.