1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
// Copyright (C) 2023  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

use std::io;
use std::path::PathBuf;
use std::sync::Mutex;

use anyhow::{Context, Result};
use clap::Parser;
use dsi_progress_logger::{ProgressLog, ProgressLogger};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

use swh_graph::graph::*;
use swh_graph::java_compat::mph::gov::GOVMPH;
use swh_graph::SWHID;

use swh_graph_provenance::earliest_revision::{find_earliest_revision, EarliestRevision};

// Command-line arguments, parsed with clap's derive API.
// NOTE: plain `//` comments are used on purpose; `///` doc comments on the
// fields would be picked up by clap as help text and change the `--help` output.
#[derive(Parser, Debug)]
#[command(about = "Given a list of directory/content SWHID on stdin, returns a CSV with header 'swhid,earliest_swhid,earliest_ts,rev_occurrences'", long_about = None)]
struct Args {
    // Positional argument: path passed to `swh_graph::graph::load_bidirectional` in `main`.
    graph_path: PathBuf,
    // Counts how many times the verbose flag was given; `main` uses the count
    // as the stderrlog verbosity level.
    #[arg(short, long, action = clap::ArgAction::Count)]
    verbose: u8,
}

/// One row of the CSV read from stdin (the reader is configured with a header row).
#[derive(Debug, Deserialize)]
struct InputRecord {
    /// SWHID in its textual form; `main` resolves it to a node id with
    /// `node_id_from_string_swhid`.
    swhid: String,
}

/// One row of the CSV written to stdout.
///
/// The field order here is the column order of the output.
#[derive(Debug, Serialize)]
struct OutputRecord {
    /// The input SWHID, echoed back unchanged.
    swhid: String,
    /// SWHID of the node returned in `EarliestRevision::node`.
    earliest_swhid: SWHID,
    /// Value of `EarliestRevision::ts`.
    /// NOTE(review): presumably a Unix timestamp in seconds; confirm against
    /// `swh_graph_provenance::earliest_revision`.
    earliest_ts: i64,
    /// Value of `EarliestRevision::rev_occurrences`.
    /// NOTE(review): presumably the number of revisions containing the input
    /// object; confirm against `find_earliest_revision`.
    rev_occurrences: u64,
}

/// Reads a CSV of SWHIDs from stdin and, for every SWHID for which
/// `find_earliest_revision` returns a result, writes one [`OutputRecord`] row
/// to stdout.
///
/// Rows are processed in parallel through rayon's `par_bridge`, so output rows
/// are not guaranteed to appear in input order. SWHIDs for which no revision is
/// found produce no output row (only a debug log line).
///
/// # Errors
///
/// Returns an error if the logger cannot be initialized, the graph or its
/// properties cannot be loaded, an input row cannot be deserialized, a SWHID
/// cannot be resolved to a node id, or an output row cannot be written or
/// flushed. The first error encountered stops processing.
///
/// # Panics
///
/// Panics if one of the internal mutexes is poisoned, i.e. if a worker thread
/// panicked while holding the writer or the progress logger.
pub fn main() -> Result<()> {
    let args = Args::parse();

    stderrlog::new()
        .verbosity(usize::from(args.verbose))
        .timestamp(stderrlog::Timestamp::Second)
        .init()
        .context("While Initializing the stderrlog")?;

    log::info!("Loading graph");
    let graph = swh_graph::graph::load_bidirectional(args.graph_path)
        .context("Could not load graph")?
        .init_properties()
        .load_properties(|props| props.load_maps::<GOVMPH>())
        .context("Could not load maps")?
        .load_properties(|props| props.load_timestamps())
        .context("Could not load timestamps")?;

    let mut reader = csv::ReaderBuilder::new()
        .has_headers(true)
        .from_reader(io::stdin());
    // The writer and the progress logger are shared by all worker threads,
    // hence the mutexes.
    let writer = csv::WriterBuilder::new()
        .has_headers(true)
        .from_writer(io::stdout());
    let writer = Mutex::new(writer);

    let mut pl = ProgressLogger::default();
    pl.item_name("SWHID");
    pl.display_memory(true);
    pl.start("Looking up SWHID provenance...");
    let pl = Mutex::new(pl);

    reader.deserialize().par_bridge().try_for_each(|record| {
        let InputRecord { swhid } = record.context("Could not deserialize input")?;

        // Name the offending SWHID so a failure deep in a large input can be located.
        let node = graph
            .properties()
            .node_id_from_string_swhid(&swhid)
            .with_context(|| format!("Could not resolve SWHID {swhid} to a node id"))?;
        match find_earliest_revision(&graph, node) {
            Some(EarliestRevision {
                node: earliest_rev_id,
                ts: earliest_ts,
                rev_occurrences,
            }) => {
                let earliest_swhid = graph.properties().swhid(earliest_rev_id);
                let record = OutputRecord {
                    swhid,
                    earliest_swhid,
                    earliest_ts,
                    rev_occurrences,
                };
                writer
                    .lock()
                    .unwrap()
                    .serialize(record)
                    .context("Could not write record")?;
            }
            None => log::debug!("no revision found containing {swhid}"),
        }
        pl.lock().unwrap().light_update();
        Ok::<(), anyhow::Error>(())
    })?;

    // Flush explicitly: the CSV writer also flushes when dropped, but that path
    // discards I/O errors, which would let truncated output go unnoticed.
    writer
        .lock()
        .unwrap()
        .flush()
        .context("Could not flush output")?;
    pl.lock().unwrap().done();

    Ok(())
}