1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// Copyright (C) 2024  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

//! Sets of values from 0 to a known maximum.
//!
//! This module contains a trait to use [`HashSet<usize>`](HashSet) and [`BitVec`]
//! interchangeably, and defines [`AdaptiveNodeSet`] which switches between the two
//! based on its content.

use std::collections::HashSet;

use sux::bits::bit_vec::BitVec;

/// Constant controlling when a [`AdaptiveNodeSet`] should be promoted from sparse to dense.
///
/// Promotion happens when the number of items in a [`AdaptiveNodeSet`] times this constant
/// is greater than the maximum value.
///
/// The value was computed experimentally, using "find-earliest-revision" (which runs
/// many BFSs of highly heterogeneous sizes) on the 2023-09-06 graph, running on
/// Software Heritage's Maxxi computer (Xeon Gold 6342 CPU @ 2.80GHz, 96 threads, 4TB RAM)
/// That graph contains 34 billion nodes, and performance increases continuously when
/// increasing up to 100 millions, then plateaus up to 1 billion; though the latter
/// uses a little less memory most of the time.
const PROMOTION_THRESHOLD: usize = 64;

pub type NodeId = usize;

/// A set of `usize` with a known maximum value
pub trait NodeSet {
    fn insert(&mut self, node: NodeId);
    fn contains(&self, node: NodeId) -> bool;
}

impl NodeSet for HashSet<usize> {
    #[inline(always)]
    fn insert(&mut self, node: usize) {
        HashSet::insert(self, node);
    }
    #[inline(always)]
    fn contains(&self, node: usize) -> bool {
        HashSet::contains(self, &node)
    }
}

impl NodeSet for BitVec {
    #[inline(always)]
    fn insert(&mut self, node: usize) {
        self.set(node, true);
    }
    #[inline(always)]
    fn contains(&self, node: usize) -> bool {
        self.get(node)
    }
}

/// Implementation of [`NodeSet`] that dynamically changes the underlying representation
/// based on its content
///
/// The current implementation is initialized with a [`HashSet`], but switches to
/// [`BitVec`] once the data becomes dense.
///
/// This has the advantage of allocating little memory if there won't be many elements,
/// but avoiding the overhead of [`HashSet`] when there are.
///
/// ```
/// # use swh_graph::collections::{AdaptiveNodeSet, NodeSet};
/// let mut node_set = AdaptiveNodeSet::new(100);
/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {} }");
/// node_set.insert(10);
/// assert_eq!(format!("{:?}", node_set), "Sparse { max_items: 100, data: {10} }");
/// for i in 20..30 {
///     node_set.insert(i);
/// }
/// assert_eq!(format!("{:?}", node_set), "Dense { data: BitVec { data: [1072694272, 0], len: 100 } }");
/// ```
#[derive(Debug)]
pub enum AdaptiveNodeSet {
    Sparse {
        max_items: usize,
        data: HashSet<NodeId>,
    },
    Dense {
        data: BitVec,
    },
}

impl AdaptiveNodeSet {
    /// Creates an empty `AdaptiveNodeSet` that may only store node ids from `0` to `max_items-1`
    #[inline(always)]
    pub fn new(max_items: usize) -> Self {
        AdaptiveNodeSet::Sparse {
            max_items,
            data: HashSet::new(),
        }
    }

    /// Creates an empty `AdaptiveNodeSet` with at least the specified capacity
    #[inline(always)]
    pub fn with_capacity(max_items: usize, capacity: usize) -> Self {
        if capacity > max_items / PROMOTION_THRESHOLD {
            AdaptiveNodeSet::Dense {
                data: BitVec::new(max_items),
            }
        } else {
            AdaptiveNodeSet::Sparse {
                max_items,
                data: HashSet::new(),
            }
        }
    }
}

impl NodeSet for AdaptiveNodeSet {
    /// Adds a node to the set
    ///
    /// # Panics
    ///
    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
    #[inline(always)]
    fn insert(&mut self, node: usize) {
        match self {
            AdaptiveNodeSet::Sparse { max_items, data } => {
                data.insert(node);
                if data.len() > *max_items / PROMOTION_THRESHOLD {
                    // Promote the hashset to a bitvec
                    let mut new_data = BitVec::new(*max_items);
                    for node in data.iter() {
                        new_data.insert(*node);
                    }
                    *self = AdaptiveNodeSet::Dense { data: new_data };
                }
            }
            AdaptiveNodeSet::Dense { data } => data.insert(node),
        }
    }

    /// Returns whether the node is part of the set
    ///
    /// # Panics
    ///
    /// If `node` is larger or equal to the `max_items` value passed to [`AdaptiveNodeSet::new`].
    #[inline(always)]
    fn contains(&self, node: usize) -> bool {
        match self {
            AdaptiveNodeSet::Sparse { max_items: _, data } => data.contains(&node),
            AdaptiveNodeSet::Dense { data } => data.contains(node),
        }
    }
}