1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
// Copyright (C) 2023-2024  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

use crate::{OutOfBoundError, SWHID};
use anyhow::{Context, Result};
use mmap_rs::{Mmap, MmapFlags, MmapMut};

/// Struct to load a `.node2swhid.bin` file and convert node ids to SWHIDs.
///
/// The map is a flat byte array of fixed-size records: the SWHID of node `i`
/// occupies the `SWHID::BYTES_SIZE` bytes starting at byte offset
/// `i * SWHID::BYTES_SIZE`.
///
/// `B` is the backing storage holding those bytes. Read accessors require
/// `B: AsRef<[u8]>`, and write accessors additionally require `B: AsMut<[u8]>`.
pub struct Node2SWHID<B> {
    /// Raw bytes of the map: consecutive fixed-size SWHID records indexed by node id.
    data: B,
}

impl Node2SWHID<Mmap> {
    /// Load a `.node2swhid.bin` file
    ///
    /// The whole file is memory-mapped (immutably) rather than read into memory.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be stat-ed, opened or memory-mapped,
    /// or if its length cannot be represented as a `usize` on this platform.
    pub fn load<P: AsRef<std::path::Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let file_len = path
            .metadata()
            .with_context(|| format!("Could not stat {}", path.display()))?
            .len();
        // The mapping length is a `usize`; a plain `as` cast would silently
        // truncate the u64 file length on targets where `usize` is narrower,
        // and we would then map only a prefix of the file.
        let map_len = usize::try_from(file_len).with_context(|| {
            format!(
                "{} is too large ({} bytes) to be mapped on this platform",
                path.display(),
                file_len
            )
        })?;
        let file = std::fs::File::open(path)
            .with_context(|| format!("Could not open {}", path.display()))?;
        // SAFETY: NOTE(review): mapping a file is unsafe because the file may be
        // modified or truncated by someone else while it is mapped; callers are
        // presumably expected not to touch the file while the map is alive.
        let data = unsafe {
            mmap_rs::MmapOptions::new(map_len)
                .context("Could not initialize mmap")?
                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES)
                .with_file(file, 0)
                .map()
                .with_context(|| format!("Could not mmap {}", path.display()))?
        };
        // Lookups by node id jump around the file, so tell the kernel not to
        // expect sequential access. This is only a hint: its return value is
        // deliberately ignored.
        // SAFETY: the pointer and length describe exactly the mapping created above.
        #[cfg(target_os = "linux")]
        unsafe {
            libc::madvise(data.as_ptr() as *mut _, data.len(), libc::MADV_RANDOM)
        };
        Ok(Self { data })
    }
}

impl Node2SWHID<MmapMut> {
    /// Create a new `.node2swhid.bin` file
    pub fn new<P: AsRef<std::path::Path>>(path: P, num_nodes: usize) -> Result<Self> {
        let path = path.as_ref();
        let file_len = (num_nodes * SWHID::BYTES_SIZE)
            .try_into()
            .context("File size overflowed u64")?;
        let file = std::fs::File::options()
            .read(true)
            .write(true)
            .create(true)
            .open(path)
            .with_context(|| format!("Could not create {}", path.display()))?;

        // fallocate the file with zeros so we can fill it without ever resizing it
        file.set_len(file_len)
            .with_context(|| format!("Could not fallocate {} with zeros", path.display()))?;

        let data = unsafe {
            mmap_rs::MmapOptions::new(file_len as _)
                .context("Could not initialize mmap")?
                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES)
                .with_file(file, 0)
                .map_mut()
                .with_context(|| format!("Could not mmap {}", path.display()))?
        };
        Ok(Self { data })
    }
}

impl Node2SWHID<Vec<u8>> {
    /// Build an in-memory map from an iterator of SWHIDs
    ///
    /// The `i`-th SWHID yielded by `swhids` becomes the SWHID of node id `i`.
    /// The buffer is sized from the length the iterator reports up front.
    ///
    /// # Panics
    ///
    /// Panics if the iterator yields more items than its reported length.
    pub fn new_from_iter(swhids: impl ExactSizeIterator<Item = SWHID>) -> Self {
        let num_nodes = swhids.len();
        // Start from an all-zero buffer, then overwrite each record in order.
        let mut map = Self {
            data: vec![0; num_nodes * SWHID::BYTES_SIZE],
        };
        swhids
            .enumerate()
            .for_each(|(node_id, swhid)| map.set(node_id, swhid));
        map
    }
}

impl<B: AsRef<[u8]>> Node2SWHID<B> {
    /// Convert a node_id to a SWHID, without checking bounds
    ///
    /// # Safety
    /// This function is unsafe because it does not check that `node_id` is
    /// within bounds of the array if debug asserts are disabled. The caller
    /// must guarantee that `node_id` is strictly less than [`Self::len`].
    ///
    /// # Panics
    /// Panics if the stored bytes are not a valid SWHID, which can only
    /// happen on a corrupted file.
    #[inline]
    pub unsafe fn get_unchecked(&self, node_id: usize) -> SWHID {
        debug_assert!(
            node_id < self.len(),
            "node id {} is out of bounds for a Node2SWHID map of {} nodes",
            node_id,
            self.len()
        );
        let offset = node_id * SWHID::BYTES_SIZE;
        // SAFETY: the caller guarantees `node_id < self.len()`, so the whole
        // record `offset..offset + SWHID::BYTES_SIZE` lies inside the buffer.
        let bytes = self
            .data
            .as_ref()
            .get_unchecked(offset..offset + SWHID::BYTES_SIZE);
        // this unwrap is always safe because we use the same const
        let bytes: [u8; SWHID::BYTES_SIZE] = bytes.try_into().unwrap();
        // this unwrap can only fail on a corrupted file, so it's ok to panic
        SWHID::try_from(bytes).unwrap()
    }

    /// Convert a node_id to a SWHID
    ///
    /// # Errors
    /// Returns [`OutOfBoundError`] if `node_id` does not designate a complete
    /// record of the map, including when the byte offset of the record would
    /// overflow `usize`.
    ///
    /// # Panics
    /// Panics if the stored bytes are not a valid SWHID, which can only
    /// happen on a corrupted file.
    #[inline]
    pub fn get(&self, node_id: usize) -> Result<SWHID, OutOfBoundError> {
        let data = self.data.as_ref();
        // Checked arithmetic: with a huge `node_id` the offset computation
        // would wrap in release builds and could designate an unrelated,
        // in-bounds record instead of reporting an error.
        let bytes = node_id
            .checked_mul(SWHID::BYTES_SIZE)
            .and_then(|start| start.checked_add(SWHID::BYTES_SIZE).map(|end| start..end))
            .and_then(|range| data.get(range))
            .ok_or(OutOfBoundError {
                index: node_id,
                len: data.len() / SWHID::BYTES_SIZE,
            })?;
        // this unwrap is always safe because we use the same const
        let bytes: [u8; SWHID::BYTES_SIZE] = bytes.try_into().unwrap();
        // this unwrap can only fail on a corrupted file, so it's ok to panic
        Ok(SWHID::try_from(bytes).unwrap())
    }

    /// Return how many node_ids are in this map
    ///
    /// This is the number of complete records in the buffer; trailing bytes
    /// that do not form a full record are not counted.
    #[allow(clippy::len_without_is_empty)] // rationale: we don't care about empty maps
    #[inline]
    pub fn len(&self) -> usize {
        self.data.as_ref().len() / SWHID::BYTES_SIZE
    }
}

impl<B: AsMut<[u8]> + AsRef<[u8]>> Node2SWHID<B> {
    /// Set a node_id to map to a given SWHID, without checking bounds
    ///
    /// # Safety
    /// This function is unsafe because it does not check that `node_id` is
    /// within bounds of the array if debug asserts are disabled. The caller
    /// must guarantee that `node_id` is strictly less than the number of
    /// records in the map.
    #[inline]
    pub unsafe fn set_unchecked(&mut self, node_id: usize, swhid: SWHID) {
        debug_assert!(
            node_id < self.data.as_ref().len() / SWHID::BYTES_SIZE,
            "node id {} is out of bounds for a Node2SWHID map of {} nodes",
            node_id,
            self.data.as_ref().len() / SWHID::BYTES_SIZE
        );
        let bytes: [u8; SWHID::BYTES_SIZE] = swhid.into();
        let offset = node_id * SWHID::BYTES_SIZE;
        // SAFETY: the caller guarantees `node_id` is in bounds, so the whole
        // record `offset..offset + SWHID::BYTES_SIZE` lies inside the buffer.
        self.data
            .as_mut()
            .get_unchecked_mut(offset..offset + SWHID::BYTES_SIZE)
            .copy_from_slice(&bytes[..]);
    }

    /// Set a node_id to map to a given SWHID
    ///
    /// # Panics
    /// Panics if `node_id` does not designate a complete record of the map,
    /// including when the byte offset of the record would overflow `usize`.
    #[inline]
    pub fn set(&mut self, node_id: usize, swhid: SWHID) {
        let bytes: [u8; SWHID::BYTES_SIZE] = swhid.into();
        // Checked arithmetic: with a huge `node_id` the offset computation
        // would wrap in release builds and could silently overwrite an
        // unrelated, in-bounds record instead of panicking.
        let range = node_id
            .checked_mul(SWHID::BYTES_SIZE)
            .and_then(|start| start.checked_add(SWHID::BYTES_SIZE).map(|end| start..end));
        let record = match range {
            Some(range) => self.data.as_mut().get_mut(range),
            None => None,
        };
        record
            .expect("Tried to write past the end of Node2SWHID map")
            .copy_from_slice(&bytes[..]);
    }
}

impl<B: AsRef<[u8]>> core::ops::Index<usize> for Node2SWHID<B> {
    type Output = SWHID;

    /// Borrow the SWHID of node `index` directly from the underlying bytes,
    /// without copying it.
    ///
    /// # Panics
    /// Panics if `index` is out of bounds, if the in-memory layout of `SWHID`
    /// does not have the size of an on-disk record, or if the record is not
    /// suitably aligned for `SWHID`.
    fn index(&self, index: usize) -> &Self::Output {
        // Checked arithmetic so that a huge `index` panics instead of wrapping
        // around (in release builds) to an unrelated, in-bounds record.
        let range = index
            .checked_mul(SWHID::BYTES_SIZE)
            .and_then(|start| start.checked_add(SWHID::BYTES_SIZE).map(|end| start..end))
            .expect("Node id overflows the byte offset into the Node2SWHID map");
        let bytes = &self.data.as_ref()[range];
        // These are preconditions of the pointer cast below, so they must also
        // hold in release builds. The size check compares two constants.
        assert!(core::mem::size_of::<SWHID>() == SWHID::BYTES_SIZE);
        let ptr = bytes.as_ptr() as *const SWHID;
        assert!((ptr as usize) % core::mem::align_of::<SWHID>() == 0);
        // unsafe :( but it's ok because SWHID does not depends on endianness
        // also TODO!: check for version
        // SAFETY: `bytes` is exactly `size_of::<SWHID>()` bytes long and `ptr` is
        // suitably aligned (both asserted above); the returned reference borrows
        // from `self`, so it cannot outlive the buffer.
        // NOTE(review): nothing here validates that the bytes are a valid
        // `SWHID` value (see the TODO above); a corrupted file is not detected.
        unsafe { &*ptr }
    }
}