1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
/*
 *
 * SPDX-FileCopyrightText: 2023 Tommaso Fontana
 * SPDX-FileCopyrightText: 2023 Inria
 * SPDX-FileCopyrightText: 2023 Sebastiano Vigna
 *
 * SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
 */

//! Ported from <https://github.com/vigna/Sux4J/blob/master/c/sf3.c>

use crate::java_compat::mph::spooky::{spooky_short, spooky_short_rehash};
use anyhow::Result;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
use std::path::Path;

/// A stub to load and access Genuzio-Ottaviano-Vigna static functions.
///
/// To generate the structure you must use the Java version:
/// ```bash
/// java it.unimi.dsi.sux4j.mph.GOV3Function --byte-array SOURCE test.sf
/// ```
///
/// To obtain a file that can be read by this structure, load
/// the serialized Java instance of the static function and
/// dump it in C-compatible format:
/// ```shell
/// echo '((it.unimi.dsi.sux4j.mph.GOV3Function)it.unimi.dsi.fastutil.io.BinIO.loadObject("test.sf")).dump("test.csf");' | jshell
/// ```
///
/// You can now load the dumped file with the [`load`](GOV3::load) method.
///
/// # Reference:
/// - [Marco Genuzio, Giuseppe Ottaviano, and Sebastiano Vigna, Fast Scalable Construction of (Minimal Perfect Hash) Functions](https://arxiv.org/pdf/1603.04330.pdf)
/// - [Java version with `dump` method](https://github.com/vigna/Sux4J/blob/master/src/it/unimi/dsi/sux4j/mph/GOV3Function.java)

#[derive(Debug)]
pub struct GOV3 {
    pub size: u64,
    pub width: u64,
    pub multiplier: u64,
    pub global_seed: u64,
    pub offset_and_seed: Vec<u64>,
    pub array: Vec<u64>,
}

impl GOV3 {
    /// Given a path to a file `.csf3` generated from the java version,
    /// load a GOV3 structure from a file
    pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
        Self::load_reader(BufReader::new(File::open(path.as_ref())?))
    }

    /// Given a generic `Read` implementor, load a GOV3 structure from a file.
    pub fn load_reader<F: Read>(mut file: F) -> Result<Self> {
        macro_rules! read {
            ($file:expr, $type:ty) => {{
                let mut buffer: [u8; core::mem::size_of::<$type>()] =
                    [0; core::mem::size_of::<$type>()];
                $file.read_exact(&mut buffer)?;
                <$type>::from_le_bytes(buffer)
            }};
        }

        macro_rules! read_array {
            ($file:expr, $type:ty) => {{
                // create a bytes buffer big enough for $len elements of type $type
                let len = read!($file, u64) as usize;
                let bytes = len * core::mem::size_of::<$type>();
                let mut buffer: Vec<u8> = vec![0; bytes];
                // read the file in the buffer
                $file.read_exact(&mut buffer)?;
                // convert the buffer Vec<u8> into a Vec<$type>
                let ptr = buffer.as_mut_ptr();
                core::mem::forget(buffer);
                unsafe { Vec::from_raw_parts(ptr as *mut $type, len, len) }
            }};
        }
        // actually load the data :)
        let size = read!(file, u64);
        let width = read!(file, u64);
        let multiplier = read!(file, u64);
        let global_seed = read!(file, u64);
        let offset_and_seed = read_array!(file, u64);
        let array = read_array!(file, u64);

        Ok(Self {
            size,
            width,
            multiplier,
            global_seed,
            offset_and_seed,
            array,
        })
    }
}

impl GOV3 {
    pub fn size(&self) -> u64 {
        self.size
    }

    pub fn get_byte_array(&self, key: &[u8]) -> u64 {
        let signature = spooky_short(key, self.global_seed);
        let bucket = ((((signature[0] as u128) >> 1) * (self.multiplier as u128)) >> 64) as u64;
        let offset_seed = self.offset_and_seed[bucket as usize];
        let bucket_offset = offset_seed & OFFSET_MASK;
        let num_variables =
            (self.offset_and_seed[bucket as usize + 1] & OFFSET_MASK) - bucket_offset;
        let e = signature_to_equation(&signature, offset_seed & (!OFFSET_MASK), num_variables);
        get_value(&self.array, e[0] + bucket_offset, self.width)
            ^ get_value(&self.array, e[1] + bucket_offset, self.width)
            ^ get_value(&self.array, e[2] + bucket_offset, self.width)
    }
}

const OFFSET_MASK: u64 = u64::MAX >> 8;

#[inline(always)]
#[must_use]
fn signature_to_equation(signature: &[u64; 4], seed: u64, num_variables: u64) -> [u64; 3] {
    let hash = spooky_short_rehash(signature, seed);
    let shift = num_variables.leading_zeros();
    let mask = (1_u64 << shift) - 1;
    [
        ((hash[0] & mask) * num_variables) >> shift,
        ((hash[1] & mask) * num_variables) >> shift,
        ((hash[2] & mask) * num_variables) >> shift,
    ]
}

#[inline(always)]
#[must_use]
fn get_value(array: &[u64], mut pos: u64, width: u64) -> u64 {
    pos *= width;
    let l = 64 - width;
    let start_word = pos / 64;
    let start_bit = pos % 64;
    if start_bit <= l {
        (array[start_word as usize] << (l - start_bit)) >> l
    } else {
        (array[start_word as usize] >> start_bit)
            | ((array[start_word as usize + 1] << (64 + l - start_bit)) >> l)
    }
}