1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
// Copyright (C) 2024 The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information
use std::ops::{Add, Range};
use rand::prelude::*;
use rayon::prelude::*;
/// Returns a parallel iterator on the range where items are pseudo-shuffled
///
/// When running an operation on all nodes of the graph, the naive approach is
/// to use `(0..graph.num_nodes()).into_par_iter().map(f)`.
///
/// However, nodes are not uniformly distributed over the range of ids, meaning that
/// some sections of the range of ids can be harder for `f` than others.
/// This can be an issue when some instances are harder than others, or simply because
/// it makes ETAs unreliable.
///
/// Instead, this function returns the same set of nodes as the naive approach would,
/// but in a somewhat-random order.
///
/// # Example
///
/// ```
/// use rayon::prelude::*;
/// use swh_graph::utils::shuffle::par_iter_shuffled_range;
///
/// let mut integers: Vec<usize> = par_iter_shuffled_range(0..10000).collect();
/// assert_ne!(integers, (0..10000).collect::<Vec<_>>(), "integers are still sorted");
/// integers.sort();
/// assert_eq!(integers, (0..10000).collect::<Vec<_>>(), "integers do not match the input range");
/// ```
pub fn par_iter_shuffled_range<Item>(
range: Range<Item>,
) -> impl ParallelIterator<Item = <Range<Item> as IntoParallelIterator>::Item>
where
Range<Item>: ExactSizeIterator,
usize: Add<Item, Output = Item>,
Item: Ord + Sync + Send + Copy,
std::ops::Range<Item>: rayon::iter::IntoParallelIterator,
{
let num_chunks = 100_000; // Arbitrary value
let chunk_size = range.len().div_ceil(num_chunks);
let mut chunks: Vec<usize> = (0..num_chunks).collect();
chunks.shuffle(&mut rand::thread_rng());
chunks
.into_par_iter()
// Lazily rebuild the list of nodes from the shuffled chunks
.flat_map(move |chunk_id| {
((chunk_id * chunk_size) + range.start)
..Item::min(
(chunk_id.checked_add(1).unwrap()) * chunk_size + range.start,
range.end,
)
})
}