1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
// Copyright (C) 2023-2024  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

use anyhow::{Context, Result};
use mmap_rs::Mmap;
use thiserror::Error;

use super::suffixes::*;
use super::*;
use crate::java_compat::fcl::FrontCodedList;
use crate::labels::FilenameId;
use crate::utils::suffix_path;

/// Trait implemented by both [`NoLabelNames`] and all implementors of [`LabelNames`],
/// to allow loading label names only if needed.
pub trait MaybeLabelNames {}

pub struct MappedLabelNames {
    label_names: FrontCodedList<Mmap, Mmap>,
}
impl<L: LabelNames> MaybeLabelNames for L {}

/// Placeholder for when label names are not loaded.
pub struct NoLabelNames;
impl MaybeLabelNames for NoLabelNames {}

/// Trait for backend storage of label names (either in-memory or memory-mapped)
pub trait LabelNames {
    type LabelNames<'a>: GetIndex<Output = Vec<u8>>
    where
        Self: 'a;

    fn label_names(&self) -> Self::LabelNames<'_>;
}

impl LabelNames for MappedLabelNames {
    type LabelNames<'a> = &'a FrontCodedList<Mmap, Mmap> where Self: 'a;

    #[inline(always)]
    fn label_names(&self) -> Self::LabelNames<'_> {
        &self.label_names
    }
}

pub struct VecLabelNames {
    label_names: Vec<Vec<u8>>,
}
impl VecLabelNames {
    /// Returns a new `VecLabelNames` from a list of label names
    pub fn new<S: AsRef<[u8]>>(label_names: Vec<S>) -> Result<Self> {
        let base64 = base64_simd::STANDARD;
        Ok(VecLabelNames {
            label_names: label_names
                .into_iter()
                .map(|s| base64.encode_to_string(s).into_bytes())
                .collect(),
        })
    }
}

impl LabelNames for VecLabelNames {
    type LabelNames<'a> = &'a [Vec<u8>] where Self: 'a;

    #[inline(always)]
    fn label_names(&self) -> Self::LabelNames<'_> {
        self.label_names.as_slice()
    }
}

impl<
        MAPS: MaybeMaps,
        TIMESTAMPS: MaybeTimestamps,
        PERSONS: MaybePersons,
        CONTENTS: MaybeContents,
        STRINGS: MaybeStrings,
    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, NoLabelNames>
{
    /// Consumes a [`SwhGraphProperties`] and returns a new one with this methods
    /// available:
    ///
    /// * [`SwhGraphProperties::label_name`]
    /// * [`SwhGraphProperties::label_name_base64`]
    pub fn load_label_names(
        self,
    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, MappedLabelNames>>
    {
        let label_names = MappedLabelNames {
            label_names: FrontCodedList::load(suffix_path(&self.path, LABEL_NAME))
                .context("Could not load label names")?,
        };
        self.with_label_names(label_names)
    }

    /// Alternative to [`load_label_names`](Self::load_label_names) that allows using
    /// arbitrary label_names implementations
    pub fn with_label_names<LABELNAMES: LabelNames>(
        self,
        label_names: LABELNAMES,
    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
        Ok(SwhGraphProperties {
            maps: self.maps,
            timestamps: self.timestamps,
            persons: self.persons,
            contents: self.contents,
            strings: self.strings,
            label_names,
            path: self.path,
            num_nodes: self.num_nodes,
        })
    }
}

/// Functions to access names of arc labels.
///
/// Only available after calling [`load_label_names`](SwhGraphProperties::load_label_names)
/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties).
impl<
        MAPS: MaybeMaps,
        TIMESTAMPS: MaybeTimestamps,
        PERSONS: MaybePersons,
        CONTENTS: MaybeContents,
        STRINGS: MaybeStrings,
        LABELNAMES: LabelNames,
    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
{
    /// Returns the base64-encoded name of an arc label
    ///
    /// This is the file name (resp. branch name) of a label on an arc from a directory
    /// (resp. snapshot)
    ///
    /// # Panics
    ///
    /// If `filename_id` does not exist
    #[inline]
    pub fn label_name_base64(&self, filename_id: FilenameId) -> Vec<u8> {
        self.try_label_name_base64(filename_id)
            .unwrap_or_else(|e| panic!("Cannot get label name: {}", e))
    }

    /// Returns the base64-encoded name of an arc label
    ///
    /// This is the file name (resp. branch name) of a label on an arc from a directory
    /// (resp. snapshot)
    #[inline]
    pub fn try_label_name_base64(
        &self,
        filename_id: FilenameId,
    ) -> Result<Vec<u8>, OutOfBoundError> {
        let index = filename_id
            .0
            .try_into()
            .expect("filename_id overflowed usize");
        self.label_names
            .label_names()
            .get(index)
            .ok_or(OutOfBoundError {
                index,
                len: self.label_names.label_names().len(),
            })
    }

    /// Returns the name of an arc label
    ///
    /// This is the file name (resp. branch name) of a label on an arc from a directory
    /// (resp. snapshot)
    ///
    /// # Panics
    ///
    /// If `filename_id` does not exist
    #[inline]
    pub fn label_name(&self, filename_id: FilenameId) -> Vec<u8> {
        self.try_label_name(filename_id)
            .unwrap_or_else(|e| panic!("Cannot get label name: {}", e))
    }

    /// Returns the name of an arc label
    ///
    /// This is the file name (resp. branch name) of a label on an arc from a directory
    /// (resp. snapshot)
    #[inline]
    pub fn try_label_name(&self, filename_id: FilenameId) -> Result<Vec<u8>, OutOfBoundError> {
        let base64 = base64_simd::STANDARD;
        self.try_label_name_base64(filename_id).map(|name| {
            base64.decode_to_vec(name).unwrap_or_else(|name| {
                panic!(
                    "Could not decode filename of id {}: {:?}",
                    filename_id.0, name
                )
            })
        })
    }

    /// Given a branch/file name, returns the filename id used by edges with that name,
    /// or `None` if it does not exist.
    ///
    /// This is the inverse function of [`label_name`](Self::label_name)
    ///
    /// Unlike in Java where this function is `O(1)`, this implementation is `O(log2(num_labels))`
    /// because it uses a binary search, as the MPH function can only be read from Java.
    #[inline]
    pub fn label_name_id(
        &self,
        name: impl AsRef<[u8]>,
    ) -> Result<FilenameId, LabelIdFromNameError> {
        use std::cmp::Ordering::*;
        let base64 = base64_simd::STANDARD;
        let name = base64.encode_to_string(name.as_ref()).into_bytes();

        // both inclusive
        let mut min = 0;
        let mut max = self
            .label_names
            .label_names()
            .len()
            .saturating_sub(1)
            .try_into()
            .expect("number of labels overflowed u64");

        while min <= max {
            let pivot = (min + max) / 2;
            let pivot_id = FilenameId(pivot);
            let pivot_name = self.label_name_base64(pivot_id);
            if min == max {
                if pivot_name.as_slice() == name {
                    return Ok(pivot_id);
                } else {
                    break;
                }
            } else {
                match pivot_name.as_slice().cmp(&name) {
                    Less => min = pivot.saturating_add(1),
                    Equal => return Ok(pivot_id),
                    Greater => max = pivot.saturating_sub(1),
                }
            }
        }

        Err(LabelIdFromNameError(name.to_vec()))
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Error)]
#[error("Unknown label name: {}", String::from_utf8_lossy(.0))]
pub struct LabelIdFromNameError(pub Vec<u8>);