GeographicLib  2.6
NearestNeighbor.hpp
Go to the documentation of this file.
1 /**
2  * \file NearestNeighbor.hpp
3  * \brief Header for GeographicLib::NearestNeighbor class
4  *
5  * Copyright (c) Charles Karney (2016-2020) <karney@alum.mit.edu> and licensed
6  * under the MIT/X11 License. For more information, see
7  * https://geographiclib.sourceforge.io/
8  **********************************************************************/
9 
10 #if !defined(GEOGRAPHICLIB_NEARESTNEIGHBOR_HPP)
11 #define GEOGRAPHICLIB_NEARESTNEIGHBOR_HPP 1
12 
13 #include <algorithm> // for nth_element, max_element, etc.
14 #include <vector>
15 #include <queue> // for priority_queue
16 #include <utility> // for swap + pair
17 #include <cstring>
18 #include <limits>
19 #include <cmath>
20 #include <sstream>
21 // Only for GeographicLib::GeographicErr
23 
24 #if defined(GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION) && \
25  GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION
26 #include <boost/serialization/nvp.hpp>
27 #include <boost/serialization/split_member.hpp>
28 #include <boost/serialization/array.hpp>
29 #include <boost/serialization/vector.hpp>
30 #endif
31 
32 namespace GeographicLib {
33 
34  /**
35  * \brief Nearest-neighbor calculations
36  *
37  * This class solves the nearest-neighbor problm using a vantage-point tree
38  * as described in \ref nearest.
39  *
40  * This class is templated so that it can handle arbitrary metric spaces as
41  * follows:
42  *
43  * @tparam dist_t the type used for measuring distances; it can be a real or
44  * signed integer type; in typical geodetic applications, \e dist_t might
45  * be <code>double</code>.
46  * @tparam pos_t the type for specifying the positions of points; geodetic
47  * application might bundle the latitude and longitude into a
48  * <code>std::pair<dist_t, dist_t></code>.
49  * @tparam distfun_t the type of a function object which takes takes two
50  * positions (of type \e pos_t) and returns the distance (of type \e
51  * dist_t); in geodetic applications, this might be a class which is
52  * constructed with a Geodesic object and which implements a member
53  * function with a signature <code>dist_t operator() (const pos_t&, const
54  * pos_t&) const</code>, which returns the geodesic distance between two
55  * points.
56  *
57  * \note The distance measure must satisfy the triangle inequality, \f$
58  * d(a,c) \le d(a,b) + d(b,c) \f$ for all points \e a, \e b, \e c. The
59  * geodesic distance (given by Geodesic::Inverse) does, while the great
60  * ellipse distance and the rhumb line distance <i>do not</i>. If you use
61  * the ordinary Euclidean distance, i.e., \f$ \sqrt{(x_a-x_b)^2 +
62  * (y_a-y_b)^2} \f$ for two dimensions, don't be tempted to leave out the
63  * square root in the interests of "efficiency"; the squared distance does
64  * not satisfy the triangle inequality!
65  *
66  * \note This is a "header-only" implementation and, as such, depends in a
67  * minimal way on the rest of GeographicLib (the only dependency is through
68  * the use of GeographicLib::GeographicErr for handling and run-time
69  * exceptions). Therefore, it is easy to extract this class from the rest of
70  * GeographicLib and use it as a stand-alone facility.
71  *
72  * The \e dist_t type must support numeric_limits queries (specifically:
73  * is_signed, is_integer, max(), digits).
74  *
75  * The NearestNeighbor object is constructed with a vector of points (type \e
76  * pos_t) and a distance function (type \e distfun_t). However the object
77  * does \e not store the points. When querying the object with Search(),
78  * it's necessary to supply the same vector of points and the same distance
79  * function.
80  *
81  * There's no capability in this implementation to add or remove points from
82  * the set. Instead Initialize() should be called to re-initialize the
83  * object with the modified vector of points.
84  *
85  * Because of the overhead in constructing a NearestNeighbor object for a
86  * large set of points, functions Save() and Load() are provided to save the
87  * object to an external file. operator<<(), operator>>() and <a
88  * href="https://www.boost.org/libs/serialization/doc"> Boost
89  * serialization</a> can also be used to save and restore a NearestNeighbor
90  * object. This is illustrated in the example.
91  *
92  * Example of use:
93  * \include example-NearestNeighbor.cpp
94  **********************************************************************/
95  template<typename dist_t, typename pos_t, class distfun_t>
97  // For tracking changes to the I/O format
98  static const int version = 1;
99  // This is what we get "free"; but if sizeof(dist_t) = 1 (unlikely), allow
100  // 4 slots (and this accommodates the default value bucket = 4).
101  static const int maxbucket =
102  (2 + ((4 * sizeof(dist_t)) / sizeof(int) >= 2 ?
103  (4 * sizeof(dist_t)) / sizeof(int) : 2));
104  public:
105 
106  /**
107  * Default constructor for NearestNeighbor.
108  *
109  * This is equivalent to specifying an empty set of points.
110  **********************************************************************/
111  NearestNeighbor() : _numpoints(0), _bucket(0), _cost(0) {}
112 
113  /**
114  * Constructor for NearestNeighbor.
115  *
116  * @param[in] pts a vector of points to include in the set.
117  * @param[in] dist the distance function object.
118  * @param[in] bucket the size of the buckets at the leaf nodes; this must
119  * lie in [0, 2 + 4*sizeof(dist_t)/sizeof(int)] (default 4).
120  * @exception GeographicErr if the value of \e bucket is out of bounds or
121  * the size of \e pts is too big for an int.
122  * @exception std::bad_alloc if memory for the tree can't be allocated.
123  *
124  * \e pts may contain coincident points (i.e., the distance between them
125  * vanishes); these are treated as distinct.
126  *
127  * The choice of \e bucket is a tradeoff between space and efficiency. A
128  * larger \e bucket decreases the size of the NearestNeighbor object which
129  * scales as pts.size() / max(1, bucket) and reduces the number of distance
130  * calculations to construct the object by log2(bucket) * pts.size().
131  * However each search then requires about bucket additional distance
132  * calculations.
133  *
134  * \warning The distances computed by \e dist must satisfy the standard
135  * metric conditions. If not, the results are undefined. Neither the data
136  * in \e pts nor the query points should contain NaNs or infinities because
137  * such data violates the metric conditions.
138  *
139  * \warning The same arguments \e pts and \e dist must be provided
140  * to the Search() function.
141  **********************************************************************/
142  NearestNeighbor(const std::vector<pos_t>& pts, const distfun_t& dist,
143  int bucket = 4) {
144  Initialize(pts, dist, bucket);
145  }
146 
147  /**
148  * Initialize or re-initialize NearestNeighbor.
149  *
150  * @param[in] pts a vector of points to include in the tree.
151  * @param[in] dist the distance function object.
152  * @param[in] bucket the size of the buckets at the leaf nodes; this must
153  * lie in [0, 2 + 4*sizeof(dist_t)/sizeof(int)] (default 4).
154  * @exception GeographicErr if the value of \e bucket is out of bounds or
155  * the size of \e pts is too big for an int.
156  * @exception std::bad_alloc if memory for the tree can't be allocated.
157  *
158  * See also the documentation on the constructor.
159  *
160  * If an exception is thrown, the state of the NearestNeighbor is
161  * unchanged.
162  **********************************************************************/
163  void Initialize(const std::vector<pos_t>& pts, const distfun_t& dist,
164  int bucket = 4) {
165  static_assert(std::numeric_limits<dist_t>::is_signed,
166  "dist_t must be a signed type");
167  if (!( 0 <= bucket && bucket <= maxbucket ))
169  ("bucket must lie in [0, 2 + 4*sizeof(dist_t)/sizeof(int)]");
170  if (pts.size() > size_t(std::numeric_limits<int>::max()))
171  throw GeographicLib::GeographicErr("pts array too big");
172  // the pair contains distance+id
173  std::vector<item> ids(pts.size());
174  for (int k = int(ids.size()); k--;)
175  ids[k] = std::make_pair(dist_t(0), k);
176  int cost = 0;
177  std::vector<Node> tree;
178  init(pts, dist, bucket, tree, ids, cost,
179  0, int(ids.size()), int(ids.size()/2));
180  _tree.swap(tree);
181  _numpoints = int(pts.size());
182  _bucket = bucket;
183  _mc = _sc = 0;
184  _cost = cost; _c1 = _k = _cmax = 0;
185  _cmin = std::numeric_limits<int>::max();
186  }
187 
188  /**
189  * Search the NearestNeighbor.
190  *
191  * @param[in] pts the vector of points used for initialization.
192  * @param[in] dist the distance function object used for initialization.
193  * @param[in] query the query point.
194  * @param[out] ind a vector of indices to the closest points found.
195  * @param[in] k the number of points to search for (default = 1).
196  * @param[in] maxdist only return points with distances of \e maxdist or
197  * less from \e query (default is the maximum \e dist_t).
198  * @param[in] mindist only return points with distances of more than
199  * \e mindist from \e query (default = &minus;1).
200  * @param[in] exhaustive whether to do an exhaustive search (default true).
201  * @param[in] tol the tolerance on the results (default 0).
202  * @return the distance to the closest point found (&minus;1 if no points
203  * are found).
204  * @exception GeographicErr if \e pts has a different size from that used
205  * to construct the object.
206  *
207  * The indices returned in \e ind are sorted by distance from \e query
208  * (closest first).
209  *
210  * The simplest invocation is with just the 4 non-optional arguments. This
211  * returns the closest distance and the index to the closest point in
212  * <i>ind</i><sub>0</sub>. If there are several points equally close, then
213  * <i>ind</i><sub>0</sub> gives the index of an arbirary one of them. If
214  * there's no closest point (because the set of points is empty), then \e
215  * ind is empty and &minus;1 is returned.
216  *
217  * With \e exhaustive = true and \e tol = 0 (their default values), this
218  * finds the indices of \e k closest neighbors to \e query whose distances
219  * to \e query are in (\e mindist, \e maxdist]. If \e mindist and \e
220  * maxdist have their default values, then these bounds have no effect. If
221  * \e query is one of the points in the tree, then set \e mindist = 0 to
222  * prevent this point (and other coincident points) from being returned.
223  *
224  * If \e exhaustive = false, exit as soon as \e k results satisfying the
225  * distance criteria are found. If less than \e k results are returned
226  * then the search was exhaustive even if \e exhaustive = false.
227  *
228  * If \e tol is positive, do an approximate search; in this case the
229  * results are to be interpreted as follows: if the <i>k</i>'th distance is
230  * \e dk, then all results with distances less than or equal \e dk &minus;
231  * \e tol are correct; all others are suspect &mdash; there may be other
232  * closer results with distances greater or equal to \e dk &minus; \e tol.
233  * If less than \e k results are found, then the search is exact.
234  *
235  * \e mindist should be used to exclude a "small" neighborhood of the query
236  * point (relative to the average spacing of the data). If \e mindist is
237  * large, the efficiency of the search deteriorates.
238  *
239  * \note Only the shortest distance is returned (as as the function value).
240  * The distances to other points (indexed by <i>ind</i><sub><i>j</i></sub>
241  * for \e j > 0) can be found by invoking \e dist again.
242  *
243  * \warning The arguments \e pts and \e dist must be identical to those
244  * used to initialize the NearestNeighbor; if not, this function will
245  * return some meaningless result (however, if the size of \e pts is wrong,
246  * this function throw an exception).
247  *
248  * \warning The query point cannot be a NaN or infinite because then the
249  * metric conditions are violated.
250  **********************************************************************/
251  dist_t Search(const std::vector<pos_t>& pts, const distfun_t& dist,
252  const pos_t& query,
253  std::vector<int>& ind,
254  int k = 1,
255  dist_t maxdist = std::numeric_limits<dist_t>::max(),
256  dist_t mindist = -1,
257  bool exhaustive = true,
258  dist_t tol = 0) const {
259  if (_numpoints != int(pts.size()))
260  throw GeographicLib::GeographicErr("pts array has wrong size");
261  std::priority_queue<item> results;
262  if (_numpoints > 0 && k > 0 && maxdist > mindist) {
263  // distance to the kth closest point so far
264  dist_t tau = maxdist;
265  // first is negative of how far query is outside boundary of node
266  // +1 if on boundary or inside
267  // second is node index
268  std::priority_queue<item> todo;
269  todo.push(std::make_pair(dist_t(1), int(_tree.size()) - 1));
270  int c = 0;
271  while (!todo.empty()) {
272  int n = todo.top().second;
273  dist_t d = -todo.top().first;
274  todo.pop();
275  dist_t tau1 = tau - tol;
276  // compare tau and d again since tau may have become smaller.
277  if (!( n >= 0 && tau1 >= d )) continue;
278  const Node& current = _tree[n];
279  dist_t dst = 0; // to suppress warning about uninitialized variable
280  bool exitflag = false, leaf = current.index < 0;
281  for (int i = 0; i < (leaf ? _bucket : 1); ++i) {
282  int index = leaf ? current.leaves[i] : current.index;
283  if (index < 0) break;
284  dst = dist(pts[index], query);
285  ++c;
286 
287  if (dst > mindist && dst <= tau) {
288  if (int(results.size()) == k) results.pop();
289  results.push(std::make_pair(dst, index));
290  if (int(results.size()) == k) {
291  if (exhaustive)
292  tau = results.top().first;
293  else {
294  exitflag = true;
295  break;
296  }
297  if (tau <= tol) {
298  exitflag = true;
299  break;
300  }
301  }
302  }
303  }
304  if (exitflag) break;
305 
306  if (current.index < 0) continue;
307  tau1 = tau - tol;
308  for (int l = 0; l < 2; ++l) {
309  if (current.data.child[l] >= 0 &&
310  dst + current.data.upper[l] >= mindist) {
311  if (dst < current.data.lower[l]) {
312  d = current.data.lower[l] - dst;
313  if (tau1 >= d)
314  todo.push(std::make_pair(-d, current.data.child[l]));
315  } else if (dst > current.data.upper[l]) {
316  d = dst - current.data.upper[l];
317  if (tau1 >= d)
318  todo.push(std::make_pair(-d, current.data.child[l]));
319  } else
320  todo.push(std::make_pair(dist_t(1), current.data.child[l]));
321  }
322  }
323  }
324  ++_k;
325  _c1 += c;
326  double omc = _mc;
327  _mc += (c - omc) / _k;
328  _sc += (c - omc) * (c - _mc);
329  if (c > _cmax) _cmax = c;
330  if (c < _cmin) _cmin = c;
331  }
332 
333  dist_t d = -1;
334  ind.resize(results.size());
335 
336  for (int i = int(ind.size()); i--;) {
337  ind[i] = int(results.top().second);
338  if (i == 0) d = results.top().first;
339  results.pop();
340  }
341  return d;
342 
343  }
344 
345  /**
346  * @return the total number of points in the set.
347  **********************************************************************/
348  int NumPoints() const { return _numpoints; }
349 
350  /**
351  * Write the object to an I/O stream.
352  *
353  * @param[in,out] os the stream to write to.
354  * @param[in] bin if true (the default) save in binary mode.
355  * @exception std::bad_alloc if memory for the string representation of the
356  * object can't be allocated.
357  *
358  * The counters tracking the statistics of searches are not saved; however
359  * the initializtion cost is saved. The format of the binary saves is \e
360  * not portable.
361  *
362  * \note <a href="https://www.boost.org/libs/serialization/doc">
363  * Boost serialization</a> can also be used to save and restore a
364  * NearestNeighbor object. This requires that the
365  * GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION macro be defined.
366  **********************************************************************/
367  void Save(std::ostream& os, bool bin = true) const {
368  int realspec = std::numeric_limits<dist_t>::digits *
369  (std::numeric_limits<dist_t>::is_integer ? -1 : 1);
370  if (bin) {
371  char id[] = "NearestNeighbor_";
372  os.write(id, 16);
373  int buf[6];
374  buf[0] = version;
375  buf[1] = realspec;
376  buf[2] = _bucket;
377  buf[3] = _numpoints;
378  buf[4] = int(_tree.size());
379  buf[5] = _cost;
380  os.write(reinterpret_cast<const char *>(buf), 6 * sizeof(int));
381  for (int i = 0; i < int(_tree.size()); ++i) {
382  const Node& node = _tree[i];
383  os.write(reinterpret_cast<const char *>(&node.index), sizeof(int));
384  if (node.index >= 0) {
385  os.write(reinterpret_cast<const char *>(node.data.lower),
386  2 * sizeof(dist_t));
387  os.write(reinterpret_cast<const char *>(node.data.upper),
388  2 * sizeof(dist_t));
389  os.write(reinterpret_cast<const char *>(node.data.child),
390  2 * sizeof(int));
391  } else {
392  os.write(reinterpret_cast<const char *>(node.leaves),
393  _bucket * sizeof(int));
394  }
395  }
396  } else {
397  std::stringstream ostring;
398  // Ensure enough precision for type dist_t. With C++11, max_digits10
399  // can be used instead.
400  if (!std::numeric_limits<dist_t>::is_integer) {
401  static const int prec
402  = int(std::ceil(std::numeric_limits<dist_t>::digits *
403  std::log10(2.0) + 1));
404  ostring.precision(prec);
405  }
406  ostring << version << " " << realspec << " " << _bucket << " "
407  << _numpoints << " " << _tree.size() << " " << _cost;
408  for (int i = 0; i < int(_tree.size()); ++i) {
409  const Node& node = _tree[i];
410  ostring << "\n" << node.index;
411  if (node.index >= 0) {
412  for (int l = 0; l < 2; ++l)
413  ostring << " " << node.data.lower[l] << " " << node.data.upper[l]
414  << " " << node.data.child[l];
415  } else {
416  for (int l = 0; l < _bucket; ++l)
417  ostring << " " << node.leaves[l];
418  }
419  }
420  os << ostring.str();
421  }
422  }
423 
424  /**
425  * Read the object from an I/O stream.
426  *
427  * @param[in,out] is the stream to read from
428  * @param[in] bin if true (the default) load in binary mode.
429  * @exception GeographicErr if the state read from \e is is illegal.
430  * @exception std::bad_alloc if memory for the tree can't be allocated.
431  *
432  * The counters tracking the statistics of searches are reset by this
433  * operation. Binary data must have been saved on a machine with the same
434  * architecture. If an exception is thrown, the state of the
435  * NearestNeighbor is unchanged.
436  *
437  * \note <a href="https://www.boost.org/libs/serialization/doc">
438  * Boost serialization</a> can also be used to save and restore a
439  * NearestNeighbor object. This requires that the
440  * GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION macro be defined.
441  *
442  * \warning The same arguments \e pts and \e dist used for
443  * initialization must be provided to the Search() function.
444  **********************************************************************/
445  void Load(std::istream& is, bool bin = true) {
446  int version1, realspec, bucket, numpoints, treesize, cost;
447  if (bin) {
448  char id[17];
449  is.read(id, 16);
450  id[16] = '\0';
451  if (!(std::strcmp(id, "NearestNeighbor_") == 0))
452  throw GeographicLib::GeographicErr("Bad ID");
453  is.read(reinterpret_cast<char *>(&version1), sizeof(int));
454  is.read(reinterpret_cast<char *>(&realspec), sizeof(int));
455  is.read(reinterpret_cast<char *>(&bucket), sizeof(int));
456  is.read(reinterpret_cast<char *>(&numpoints), sizeof(int));
457  is.read(reinterpret_cast<char *>(&treesize), sizeof(int));
458  is.read(reinterpret_cast<char *>(&cost), sizeof(int));
459  } else {
460  if (!( is >> version1 >> realspec >> bucket >> numpoints >> treesize
461  >> cost ))
462  throw GeographicLib::GeographicErr("Bad header");
463  }
464  if (!( version1 == version ))
465  throw GeographicLib::GeographicErr("Incompatible version");
466  if (!( realspec == std::numeric_limits<dist_t>::digits *
467  (std::numeric_limits<dist_t>::is_integer ? -1 : 1) ))
468  throw GeographicLib::GeographicErr("Different dist_t types");
469  if (!( 0 <= bucket && bucket <= maxbucket ))
470  throw GeographicLib::GeographicErr("Bad bucket size");
471  if (!( 0 <= treesize && treesize <= numpoints ))
472  throw
473  GeographicLib::GeographicErr("Bad number of points or tree size");
474  if (!( 0 <= cost ))
475  throw GeographicLib::GeographicErr("Bad value for cost");
476  std::vector<Node> tree;
477  tree.reserve(treesize);
478  for (int i = 0; i < treesize; ++i) {
479  Node node;
480  if (bin) {
481  is.read(reinterpret_cast<char *>(&node.index), sizeof(int));
482  if (node.index >= 0) {
483  is.read(reinterpret_cast<char *>(node.data.lower),
484  2 * sizeof(dist_t));
485  is.read(reinterpret_cast<char *>(node.data.upper),
486  2 * sizeof(dist_t));
487  is.read(reinterpret_cast<char *>(node.data.child),
488  2 * sizeof(int));
489  } else {
490  is.read(reinterpret_cast<char *>(node.leaves),
491  bucket * sizeof(int));
492  for (int l = bucket; l < maxbucket; ++l)
493  node.leaves[l] = 0;
494  }
495  } else {
496  if (!( is >> node.index ))
497  throw GeographicLib::GeographicErr("Bad index");
498  if (node.index >= 0) {
499  for (int l = 0; l < 2; ++l) {
500  if (!( is >> node.data.lower[l] >> node.data.upper[l]
501  >> node.data.child[l] ))
502  throw GeographicLib::GeographicErr("Bad node data");
503  }
504  } else {
505  // Must be at least one valid leaf followed by a sequence end
506  // markers (-1).
507  for (int l = 0; l < bucket; ++l) {
508  if (!( is >> node.leaves[l] ))
509  throw GeographicLib::GeographicErr("Bad leaf data");
510  }
511  for (int l = bucket; l < maxbucket; ++l)
512  node.leaves[l] = 0;
513  }
514  }
515  node.Check(numpoints, treesize, bucket);
516  tree.push_back(node);
517  }
518  _tree.swap(tree);
519  _numpoints = numpoints;
520  _bucket = bucket;
521  _mc = _sc = 0;
522  _cost = cost; _c1 = _k = _cmax = 0;
523  _cmin = std::numeric_limits<int>::max();
524  }
525 
526  /**
527  * Write the object to stream \e os as text.
528  *
529  * @param[in,out] os the output stream.
530  * @param[in] t the NearestNeighbor object to be saved.
531  * @exception std::bad_alloc if memory for the string representation of the
532  * object can't be allocated.
533  **********************************************************************/
534  friend std::ostream& operator<<(std::ostream& os, const NearestNeighbor& t)
535  { t.Save(os, false); return os; }
536 
537  /**
538  * Read the object from stream \e is as text.
539  *
540  * @param[in,out] is the input stream.
541  * @param[out] t the NearestNeighbor object to be loaded.
542  * @exception GeographicErr if the state read from \e is is illegal.
543  * @exception std::bad_alloc if memory for the tree can't be allocated.
544  **********************************************************************/
545  friend std::istream& operator>>(std::istream& is, NearestNeighbor& t)
546  { t.Load(is, false); return is; }
547 
548  /**
549  * Swap with another NearestNeighbor object.
550  *
551  * @param[in,out] t the NearestNeighbor object to swap with.
552  **********************************************************************/
554  std::swap(_numpoints, t._numpoints);
555  std::swap(_bucket, t._bucket);
556  std::swap(_cost, t._cost);
557  _tree.swap(t._tree);
558  std::swap(_mc, t._mc);
559  std::swap(_sc, t._sc);
560  std::swap(_c1, t._c1);
561  std::swap(_k, t._k);
562  std::swap(_cmin, t._cmin);
563  std::swap(_cmax, t._cmax);
564  }
565 
566  /**
567  * The accumulated statistics on the searches so far.
568  *
569  * @param[out] setupcost the cost of initializing the NearestNeighbor.
570  * @param[out] numsearches the number of calls to Search().
571  * @param[out] searchcost the total cost of the calls to Search().
572  * @param[out] mincost the minimum cost of a Search().
573  * @param[out] maxcost the maximum cost of a Search().
574  * @param[out] mean the mean cost of a Search().
575  * @param[out] sd the standard deviation in the cost of a Search().
576  *
577  * Here "cost" measures the number of distance calculations needed. Note
578  * that the accumulation of statistics is \e not thread safe.
579  **********************************************************************/
580  void Statistics(int& setupcost, int& numsearches, int& searchcost,
581  int& mincost, int& maxcost,
582  double& mean, double& sd) const {
583  setupcost = _cost; numsearches = _k; searchcost = _c1;
584  mincost = _cmin; maxcost = _cmax;
585  mean = _mc; sd = std::sqrt(_sc / (_k - 1));
586  }
587 
588  /**
589  * Reset the counters for the accumulated statistics on the searches so
590  * far.
591  **********************************************************************/
592  void ResetStatistics() const {
593  _mc = _sc = 0;
594  _c1 = _k = _cmax = 0;
595  _cmin = std::numeric_limits<int>::max();
596  }
597 
598  private:
599  // Package up a dist_t and an int. We will want to sort on the dist_t so
600  // put it first.
601  typedef std::pair<dist_t, int> item;
602  // \cond SKIP
603  class Node {
604  public:
605  struct bounds {
606  dist_t lower[2], upper[2]; // bounds on inner/outer distances
607  int child[2];
608  };
609  union {
610  bounds data;
611  int leaves[maxbucket];
612  };
613  int index;
614 
615  Node()
616  : index(-1)
617  {
618  for (int i = 0; i < 2; ++i) {
619  data.lower[i] = data.upper[i] = 0;
620  data.child[i] = -1;
621  }
622  }
623 
624  // Sanity check on a Node
625  void Check(int numpoints, int treesize, int bucket) const {
626  if (!( -1 <= index && index < numpoints ))
627  throw GeographicLib::GeographicErr("Bad index");
628  if (index >= 0) {
629  if (!( -1 <= data.child[0] && data.child[0] < treesize &&
630  -1 <= data.child[1] && data.child[1] < treesize ))
631  throw GeographicLib::GeographicErr("Bad child pointers");
632  if (!( 0 <= data.lower[0] && data.lower[0] <= data.upper[0] &&
633  data.upper[0] <= data.lower[1] &&
634  data.lower[1] <= data.upper[1] ))
635  throw GeographicLib::GeographicErr("Bad bounds");
636  } else {
637  // Must be at least one valid leaf followed by a sequence end markers
638  // (-1).
639  bool start = true;
640  for (int l = 0; l < bucket; ++l) {
641  if (!( (start ?
642  ((l == 0 ? 0 : -1) <= leaves[l] && leaves[l] < numpoints) :
643  leaves[l] == -1) ))
644  throw GeographicLib::GeographicErr("Bad leaf data");
645  start = leaves[l] >= 0;
646  }
647  for (int l = bucket; l < maxbucket; ++l) {
648  if (leaves[l] != 0)
649  throw GeographicLib::GeographicErr("Bad leaf data");
650  }
651  }
652  }
653 
654 #if defined(GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION) && \
655  GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION
656  friend class boost::serialization::access;
657  template<class Archive>
658  void save(Archive& ar, const unsigned int) const {
659  ar & boost::serialization::make_nvp("index", index);
660  if (index < 0)
661  ar & boost::serialization::make_nvp("leaves", leaves);
662  else
663  ar & boost::serialization::make_nvp("lower", data.lower)
664  & boost::serialization::make_nvp("upper", data.upper)
665  & boost::serialization::make_nvp("child", data.child);
666  }
667  template<class Archive>
668  void load(Archive& ar, const unsigned int) {
669  ar & boost::serialization::make_nvp("index", index);
670  if (index < 0)
671  ar & boost::serialization::make_nvp("leaves", leaves);
672  else
673  ar & boost::serialization::make_nvp("lower", data.lower)
674  & boost::serialization::make_nvp("upper", data.upper)
675  & boost::serialization::make_nvp("child", data.child);
676  }
677  template<class Archive>
678  void serialize(Archive& ar, const unsigned int file_version)
679  { boost::serialization::split_member(ar, *this, file_version); }
680 #endif
681  };
682  // \endcond
683 #if defined(GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION) && \
684  GEOGRAPHICLIB_HAVE_BOOST_SERIALIZATION
685  friend class boost::serialization::access;
686  template<class Archive> void save(Archive& ar, const unsigned) const {
687  int realspec = std::numeric_limits<dist_t>::digits *
688  (std::numeric_limits<dist_t>::is_integer ? -1 : 1);
689  // Need to use version1, otherwise load error in debug mode on Linux:
690  // undefined reference to GeographicLib::NearestNeighbor<...>::version.
691  int version1 = version;
692  ar & boost::serialization::make_nvp("version", version1)
693  & boost::serialization::make_nvp("realspec", realspec)
694  & boost::serialization::make_nvp("bucket", _bucket)
695  & boost::serialization::make_nvp("numpoints", _numpoints)
696  & boost::serialization::make_nvp("cost", _cost)
697  & boost::serialization::make_nvp("tree", _tree);
698  }
699  template<class Archive> void load(Archive& ar, const unsigned) {
700  int version1, realspec, bucket, numpoints, cost;
701  ar & boost::serialization::make_nvp("version", version1);
702  if (version1 != version)
703  throw GeographicLib::GeographicErr("Incompatible version");
704  std::vector<Node> tree;
705  ar & boost::serialization::make_nvp("realspec", realspec);
706  if (!( realspec == std::numeric_limits<dist_t>::digits *
707  (std::numeric_limits<dist_t>::is_integer ? -1 : 1) ))
708  throw GeographicLib::GeographicErr("Different dist_t types");
709  ar & boost::serialization::make_nvp("bucket", bucket);
710  if (!( 0 <= bucket && bucket <= maxbucket ))
711  throw GeographicLib::GeographicErr("Bad bucket size");
712  ar & boost::serialization::make_nvp("numpoints", numpoints)
713  & boost::serialization::make_nvp("cost", cost)
714  & boost::serialization::make_nvp("tree", tree);
715  if (!( 0 <= int(tree.size()) && int(tree.size()) <= numpoints ))
716  throw
717  GeographicLib::GeographicErr("Bad number of points or tree size");
718  for (int i = 0; i < int(tree.size()); ++i)
719  tree[i].Check(numpoints, int(tree.size()), bucket);
720  _tree.swap(tree);
721  _numpoints = numpoints;
722  _bucket = bucket;
723  _mc = _sc = 0;
724  _cost = cost; _c1 = _k = _cmax = 0;
725  _cmin = std::numeric_limits<int>::max();
726  }
727  template<class Archive>
728  void serialize(Archive& ar, const unsigned int file_version)
729  { boost::serialization::split_member(ar, *this, file_version); }
730 #endif
731 
732  int _numpoints, _bucket, _cost;
733  std::vector<Node> _tree;
734  // Counters to track stastistics on the cost of searches
735  mutable double _mc, _sc;
736  mutable int _c1, _k, _cmin, _cmax;
737 
738  int init(const std::vector<pos_t>& pts, const distfun_t& dist, int bucket,
739  std::vector<Node>& tree, std::vector<item>& ids, int& cost,
740  int l, int u, int vp) {
741 
742  if (u == l)
743  return -1;
744  Node node;
745 
746  if (u - l > (bucket == 0 ? 1 : bucket)) {
747 
748  // choose a vantage point and move it to the start
749  int i = vp;
750  std::swap(ids[l], ids[i]);
751 
752  int m = (u + l + 1) / 2;
753 
754  for (int k = l + 1; k < u; ++k) {
755  ids[k].first = dist(pts[ids[l].second], pts[ids[k].second]);
756  ++cost;
757  }
758  // partition around the median distance
759  std::nth_element(ids.begin() + l + 1,
760  ids.begin() + m,
761  ids.begin() + u);
762  node.index = ids[l].second;
763  if (m > l + 1) { // node.child[0] is possibly empty
764  typename std::vector<item>::iterator
765  t = std::min_element(ids.begin() + l + 1, ids.begin() + m);
766  node.data.lower[0] = t->first;
767  t = std::max_element(ids.begin() + l + 1, ids.begin() + m);
768  node.data.upper[0] = t->first;
769  // Use point with max distance as vantage point; this point act as a
770  // "corner" point and leads to a good partition.
771  node.data.child[0] = init(pts, dist, bucket, tree, ids, cost,
772  l + 1, m, int(t - ids.begin()));
773  }
774  typename std::vector<item>::iterator
775  t = std::max_element(ids.begin() + m, ids.begin() + u);
776  node.data.lower[1] = ids[m].first;
777  node.data.upper[1] = t->first;
778  // Use point with max distance as vantage point here too
779  node.data.child[1] = init(pts, dist, bucket, tree, ids, cost,
780  m, u, int(t - ids.begin()));
781  } else {
782  if (bucket == 0)
783  node.index = ids[l].second;
784  else {
785  node.index = -1;
786  // Sort the bucket entries so that the tree is independent of the
787  // implementation of nth_element.
788  std::sort(ids.begin() + l, ids.begin() + u);
789  for (int i = l; i < u; ++i)
790  node.leaves[i-l] = ids[i].second;
791  for (int i = u - l; i < bucket; ++i)
792  node.leaves[i] = -1;
793  for (int i = bucket; i < maxbucket; ++i)
794  node.leaves[i] = 0;
795  }
796  }
797 
798  tree.push_back(node);
799  return int(tree.size()) - 1;
800  }
801 
802  };
803 
804 } // namespace GeographicLib
805 
806 namespace std {
807 
808  /**
809  * Swap two GeographicLib::NearestNeighbor objects.
810  *
811  * @tparam dist_t the type used for measuring distances.
812  * @tparam pos_t the type for specifying the positions of points.
813  * @tparam distfun_t the type for a function object which calculates
814  * distances between points.
815  * @param[in,out] a the first GeographicLib::NearestNeighbor to swap.
816  * @param[in,out] b the second GeographicLib::NearestNeighbor to swap.
817  **********************************************************************/
818  template<typename dist_t, typename pos_t, class distfun_t>
821  a.swap(b);
822  }
823 
824 } // namespace std
825 
826 #endif // GEOGRAPHICLIB_NEARESTNEIGHBOR_HPP
void Statistics(int &setupcost, int &numsearches, int &searchcost, int &mincost, int &maxcost, double &mean, double &sd) const
NearestNeighbor(const std::vector< pos_t > &pts, const distfun_t &dist, int bucket=4)
void Load(std::istream &is, bool bin=true)
dist_t Search(const std::vector< pos_t > &pts, const distfun_t &dist, const pos_t &query, std::vector< int > &ind, int k=1, dist_t maxdist=std::numeric_limits< dist_t >::max(), dist_t mindist=-1, bool exhaustive=true, dist_t tol=0) const
Namespace for GeographicLib.
Definition: Accumulator.cpp:12
friend std::ostream & operator<<(std::ostream &os, const NearestNeighbor &t)
void swap(GeographicLib::NearestNeighbor< dist_t, pos_t, distfun_t > &a, GeographicLib::NearestNeighbor< dist_t, pos_t, distfun_t > &b)
friend std::istream & operator>>(std::istream &is, NearestNeighbor &t)
Exception handling for GeographicLib.
Definition: Constants.hpp:344
Header for GeographicLib::Constants class.
void Initialize(const std::vector< pos_t > &pts, const distfun_t &dist, int bucket=4)
void swap(NearestNeighbor &t)
Nearest-neighbor calculations.
void Save(std::ostream &os, bool bin=true) const