1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// Copyright (C) 2023  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

//! Structure to read the same data structures as `it.unimi.dsi.fastutil.chars.CharArrayFrontCodedList`,
//! which were used to store labels in graphs generated by the Java implementation of swh-graph.
//!
//! In newer graphs (generated by the Rust implementation of swh-graph), [`sux::dict::rear_coded_list`]
//! is used instead, as it is more space-efficient.

use std::fs::File;
use std::io::BufReader;
use std::path::Path;

use anyhow::{bail, Context, Result};
use mmap_rs::{Mmap, MmapFlags};

use crate::utils::{suffix_path, GetIndex};

#[derive(Debug, Clone)]
/// Read-only front-coded list of bytestrings.
///
/// Front coding stores a sequence of strings so that a prefix shared by
/// consecutive strings is written only once.
///
/// Strings are grouped in blocks of `k`. The first string of a block is stored
/// in full; each following string of the block is stored as the length of its
/// new suffix, the number of leading bytes it reuses from the previously
/// decoded string, and the suffix bytes themselves.
///
/// See the
/// [`it.unimi.dsi.fastutil.bytes.ByteArrayFrontCodedBigList` documentation](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedBigList.html)
/// or the
/// [implementation](https://archive.softwareheritage.org/swh:1:cnt:2fc1092d2f792fcfcbf6ff9baf849f6d22e41486;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java) for details
pub struct FrontCodedList<D, P>
where
    D: AsRef<[u8]>,
    P: AsRef<[u8]>,
{
    /// Number of strings per block. Larger blocks compress better but make
    /// lookups slower, because a lookup decodes its block from the start.
    k: usize,
    /// Total number of strings stored in the list
    len: usize,
    /// The front-coded bytestrings, block after block
    data: D,
    /// One big-endian 64-bit integer per block: the byte offset in `data`
    /// at which that block starts
    pointers: P,
}

impl FrontCodedList<Mmap, Mmap> {
    pub fn load<P: AsRef<Path>>(base_path: P) -> Result<Self> {
        let properties_path = suffix_path(&base_path, ".properties");
        let bytearray_path = suffix_path(&base_path, ".bytearray");
        let pointers_path = suffix_path(&base_path, ".pointers");

        // Parse properties
        let properties_file = File::open(&properties_path)
            .with_context(|| format!("Could not open {}", properties_path.display()))?;
        let map = java_properties::read(BufReader::new(properties_file)).with_context(|| {
            format!(
                "Could not parse properties from {}",
                properties_path.display()
            )
        })?;
        let len =
            map.get("n").unwrap().parse::<usize>().with_context(|| {
                format!("Could not parse 'n' from {}", properties_path.display())
            })?;
        let k = map
            .get("ratio")
            .unwrap()
            .parse::<usize>()
            .with_context(|| {
                format!("Could not parse 'ratio' from {}", properties_path.display())
            })?;

        // mmap data
        let bytearray_len = bytearray_path
            .metadata()
            .with_context(|| format!("Could not read {} stats", bytearray_path.display()))?
            .len();
        let bytearray_file = std::fs::File::open(&bytearray_path)
            .with_context(|| format!("Could not open {}", bytearray_path.display()))?;
        let data = unsafe {
            mmap_rs::MmapOptions::new(bytearray_len as _)
                .context("Could not initialize mmap")?
                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES)
                .with_file(bytearray_file, 0)
                .map()
                .with_context(|| format!("Could not mmap {}", bytearray_path.display()))?
        };
        #[cfg(target_os = "linux")]
        unsafe {
            libc::madvise(data.as_ptr() as *mut _, data.len(), libc::MADV_RANDOM)
        };

        // mmap pointers
        let pointers_len = pointers_path
            .metadata()
            .with_context(|| format!("Could not read {} stats", pointers_path.display()))?
            .len();
        let expected_pointers_len = ((len.div_ceil(k)) * 8) as u64;
        if pointers_len != expected_pointers_len {
            bail!(
                "FCL at {} has length {} and ratio {} so {} should have length {}, but it has length {}",
                base_path.as_ref().display(),
                len,
                k,
                pointers_path.display(),
                expected_pointers_len,
                pointers_len
            );
        }
        let pointers_file = std::fs::File::open(&pointers_path)
            .with_context(|| format!("Could not open {}", pointers_path.display()))?;
        let pointers = unsafe {
            mmap_rs::MmapOptions::new(pointers_len as _)
                .context("Could not initialize mmap")?
                .with_flags(MmapFlags::TRANSPARENT_HUGE_PAGES)
                .with_file(pointers_file, 0)
                .map()
                .with_context(|| format!("Could not mmap {}", pointers_path.display()))?
        };
        #[cfg(target_os = "linux")]
        unsafe {
            libc::madvise(data.as_ptr() as *mut _, data.len(), libc::MADV_RANDOM)
        };

        Ok(FrontCodedList {
            k,
            len,
            data,
            pointers,
        })
    }
}

// Adapted from https://archive.softwareheritage.org/swh:1:cnt:08cf9306577d3948360afebfa77ee623edec7f1a;origin=https://github.com/vigna/sux-rs;visit=swh:1:snp:bed7ce7510f76c0b1e8fb995778028614bfff354;anchor=swh:1:rev:fac0e742d7a404237abca48e4aeffcde34f41e58;path=/src/dict/rear_coded_list.rs;lines=304-329
impl<D: AsRef<[u8]>, P: AsRef<[u8]>> FrontCodedList<D, P> {
    /// Decodes the `index`-th string into `result`, replacing whatever the
    /// buffer held before. Taking the buffer from the caller lets it be
    /// reused across queries instead of allocating one per lookup.
    ///
    /// # Panics
    ///
    /// If `index` designates a block beyond the end of the pointers, if a
    /// pointer does not fit in `usize`, or if the encoded data ends before
    /// the string is fully decoded.
    #[inline(always)]
    pub fn get_inplace(&self, index: usize, result: &mut Vec<u8>) {
        result.clear();
        let (block, offset) = (index / self.k, index % self.k);

        // Each block owns one big-endian 64-bit offset into the data.
        let pointer_bytes: [u8; 8] = self.pointers.as_ref()[block * 8..(block + 1) * 8]
            .try_into()
            .unwrap();
        let start: usize = u64::from_be_bytes(pointer_bytes)
            .try_into()
            .expect("FCL pointer overflowed usize");

        // The head of the block is stored in full: a length, then the bytes.
        let (head_len, after_len) = decode_int(&self.data.as_ref()[start..]);
        let head_len = head_len as usize;
        result.extend_from_slice(&after_len[..head_len]);
        let mut cursor = &after_len[head_len..];

        // Every later string is rebuilt from its predecessor: keep the shared
        // prefix, then append the new suffix.
        for _ in 0..offset {
            let (suffix_len, after_suffix_len) = decode_int(cursor);
            let (prefix_len, payload) = decode_int(after_suffix_len);
            let suffix_len = suffix_len as usize;

            result.resize(prefix_len as usize, 0);
            result.extend_from_slice(&payload[..suffix_len]);

            cursor = &payload[suffix_len..];
        }
    }
}

impl<D: AsRef<[u8]>, P: AsRef<[u8]>> GetIndex for &FrontCodedList<D, P> {
    type Output = Vec<u8>;

    /// Returns the number of strings in the list
    fn len(&self) -> usize {
        self.len
    }

    /// Returns a freshly allocated copy of the `index`-th bytestring, or
    /// `None` if `index` is not smaller than the length of the list
    fn get(&self, index: usize) -> Option<Self::Output> {
        (index < self.len).then(|| {
            let mut buffer = Vec::with_capacity(128);
            self.get_inplace(index, &mut buffer);
            buffer
        })
    }

    /// Returns a freshly allocated copy of the `index`-th bytestring, without
    /// comparing `index` to the length of the list first
    ///
    /// # Panics
    ///
    /// If `index` is out of bound
    unsafe fn get_unchecked(&self, index: usize) -> Self::Output {
        let mut buffer = Vec::with_capacity(128);
        self.get_inplace(index, &mut buffer);
        buffer
    }
}

#[inline(always)]
// Decodes the variable-length integer found at the start of `data`, and returns
// it along with the bytes that follow it.
//
// An integer takes one to five bytes. Every byte but the last has its high bit
// set and contributes the bitwise complement of its value; the last byte is
// used as is. Bytes come most-significant first, seven bits apart.
//
// Panics if `data` ends before the integer does.
//
// Adapted from https://archive.softwareheritage.org/swh:1:cnt:66c21893f9cd9686456b0127df0b9b48a0fe153d;origin=https://repo1.maven.org/maven2/it/unimi/dsi/fastutil;visit=swh:1:snp:8007412c404cf39fa38e3db600bdf93700410741;anchor=swh:1:rel:1ec2b63253f642eae54f1a3e5ddd20178867bc7d;path=/it/unimi/dsi/fastutil/bytes/ByteArrayFrontCodedList.java;lines=142-159
fn decode_int(data: &[u8]) -> (u32, &[u8]) {
    const CONTINUATION_BIT: u8 = 0b1000_0000;
    // Only the first four bytes are tested for the continuation bit; a fifth
    // byte, when reached, always terminates the integer.
    const MAX_LEADING_BYTES: usize = 4;

    let mut value = 0u32;
    for position in 0..MAX_LEADING_BYTES {
        let byte = data[position];
        if byte & CONTINUATION_BIT == 0 {
            return ((value << 7) | u32::from(byte), &data[position + 1..]);
        }
        value = (value << 7) | u32::from(!byte);
    }

    // Bits shifted past the 32nd are dropped, as in a plain 32-bit shift.
    (
        (value << 7) | u32::from(data[MAX_LEADING_BYTES]),
        &data[MAX_LEADING_BYTES + 1..],
    )
}