1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
// Copyright (C) 2024 The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information
//! Sets of values from 0 to a known maximum.
//!
//! This module contains a trait to use [`HashSet<usize>`](HashSet) and [`BitVec`]
//! interchangeably, and defines [`AdaptiveNodeSet`] which switches between the two
//! based on its content.
use std::collections::HashSet;
use sux::bits::bit_vec::BitVec;
/// Constant controlling when a [`AdaptiveNodeSet`] should be promoted from sparse to dense.
///
/// Promotion happens when the number of items in a [`AdaptiveNodeSet`] times this constant
/// is greater than the maximum value.
///
/// The value was computed experimentally, using "find-earliest-revision" (which runs
/// many BFSs of highly heterogeneous sizes) on the 2023-09-06 graph, running on
/// Software Heritage's Maxxi computer (Xeon Gold 6342 CPU @ 2.80GHz, 96 threads, 4TB RAM)
/// That graph contains 34 billion nodes, and performance increases continuously when
/// increasing up to 100 millions, then plateaus up to 1 billion; though the latter
/// uses a little less memory most of the time.
const PROMOTION_THRESHOLD: usize = 64;
pub type NodeId = usize;
/// A set of `usize` with a known maximum value
pub trait NodeSet {
fn insert(&mut self, node: NodeId);
fn contains(&self, node: NodeId) -> bool;
}
impl NodeSet for HashSet<usize> {
#[inline(always)]
fn insert(&mut self, node: usize) {
HashSet::insert(self, node);
}
#[inline(always)]
fn contains(&self, node: usize) -> bool {
HashSet::contains(self, &node)
}
}
impl NodeSet for BitVec {
#[inline(always)]
fn insert(&mut self, node: usize) {
self.set(node, true);
}
#[inline(always)]
fn contains(&self, node: usize) -> bool {
self.get(node)
}
}
/// Implementation of [`NodeSet`] that dynamically changes the underlying representation
/// based on its content
///
/// The current implementation is initialized with a [`HashSet`], but switches to
/// [`BitVec`] once the data becomes dense.
///
/// This has the advantage of allocating little memory if there won't be many elements,
/// but avoiding the overhead of [`HashSet`] when there are.
///
/// ```
/// # use swh_graph::collections::{AdaptiveNodeSet, NodeSet};
/// let mut node_set = AdaptiveNodeSet::new(100);
/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {} }");
/// node_set.insert(10);
/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {10} }");
/// for i in 20..30 {
/// node_set.insert(i);
/// }
/// assert_eq!(format!("{:?}", node_set), "Dense { data: BitVec { data: [1072694272, 0], len: 100 } }");
/// ```
#[derive(Debug)]
pub enum AdaptiveNodeSet {
Sparse {
max_items: usize,
data: HashSet<NodeId>,
},
Dense {
data: BitVec,
},
}
impl AdaptiveNodeSet {
/// Creates an empty `AdaptiveNodeSet` that may only store node ids from `0` to `max_items-1`
#[inline(always)]
pub fn new(max_items: usize) -> Self {
AdaptiveNodeSet::Sparse {
max_items,
data: HashSet::new(),
}
}
/// Creates an empty `AdaptiveNodeSet` with at least the specified capacity
#[inline(always)]
pub fn with_capacity(max_items: usize, capacity: usize) -> Self {
if capacity > max_items / PROMOTION_THRESHOLD {
AdaptiveNodeSet::Dense {
data: BitVec::new(max_items),
}
} else {
AdaptiveNodeSet::Sparse {
max_items,
data: HashSet::new(),
}
}
}
}
impl NodeSet for AdaptiveNodeSet {
/// Adds a node to the set
///
/// # Panics
///
/// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
#[inline(always)]
fn insert(&mut self, node: usize) {
match self {
AdaptiveNodeSet::Sparse { max_items, data } => {
data.insert(node);
if data.len() > *max_items / PROMOTION_THRESHOLD {
// Promote the hashset to a bitvec
let mut new_data = BitVec::new(*max_items);
for node in data.iter() {
new_data.insert(*node);
}
*self = AdaptiveNodeSet::Dense { data: new_data };
}
}
AdaptiveNodeSet::Dense { data } => data.insert(node),
}
}
/// Returns whether the node is part of the set
///
/// # Panics
///
/// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
#[inline(always)]
fn contains(&self, node: usize) -> bool {
match self {
AdaptiveNodeSet::Sparse { max_items: _, data } => data.contains(&node),
AdaptiveNodeSet::Dense { data } => data.contains(node),
}
}
}