// Copyright (C) 2024  The Software Heritage developers
// See the AUTHORS file at the top-level directory of this distribution
// License: GNU General Public License version 3, or any later version
// See top-level LICENSE file for more information

use std::fs::File;
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::{Context, Result};

use arrow::datatypes::Schema;
use parquet::arrow::ArrowWriter as ParquetWriter;
use parquet::file::properties::WriterProperties;
use parquet::format::FileMetaData;

use super::{StructArrayBuilder, TableWriter};

/// Writer to a .parquet file, usable with [`ParallelDatasetWriter`](super::ParallelDatasetWriter)
///
/// `Builder` should follow the pattern documented by
/// [`arrow::builder`](https://docs.rs/arrow/latest/arrow/array/builder/index.html)
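///
/// # Example
///
/// A minimal sketch, not compiled as a doctest: it assumes a hypothetical
/// `NodeBuilder` implementing `Default + StructArrayBuilder` with a
/// `push_id(u64)` method, and that the [`TableWriter`] trait is in scope;
/// everything else follows the API defined in this module.
///
/// ```ignore
/// use std::path::PathBuf;
/// use std::sync::Arc;
///
/// use arrow::datatypes::{DataType, Field, Schema};
/// use parquet::file::properties::WriterProperties;
///
/// let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt64, false)]));
/// let properties = WriterProperties::builder()
///     .set_max_row_group_size(1_000_000)
///     .build();
///
/// // `NodeBuilder` is a hypothetical builder type defined by the caller.
/// // `new()` appends the ".parquet" extension to the given path; passing `None`
/// // defaults `flush_threshold` to 90% of the row group size (here, 900_000).
/// let mut writer =
///     ParquetTableWriter::<NodeBuilder>::new(PathBuf::from("nodes"), (schema, properties), None)?;
///
/// // `builder()` may first flush a full row group to disk, then hands out the builder.
/// writer.builder()?.push_id(42);
///
/// // Flushes the remaining rows and writes the Parquet footer.
/// let _metadata = writer.close()?;
/// ```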
pub struct ParquetTableWriter<Builder: Default + StructArrayBuilder> {
    path: PathBuf,
    /// Automatically flushes the builder to disk when its length reaches this value.
    /// To avoid uneven row group sizes, this value plus the number of rows added
    /// to the builder between two calls to [`Self::builder`] should be equal to
    /// [`max_row_group_size`](WriterProperties::max_row_group_size)
    /// (or a multiple of it).
    pub flush_threshold: usize,
    file_writer: Option<ParquetWriter<File>>, // None only between .close() call and Drop
    builder: Builder,
}

impl<Builder: Default + StructArrayBuilder> TableWriter for ParquetTableWriter<Builder> {
    type Schema = (Arc<Schema>, WriterProperties);
    type CloseResult = FileMetaData;

    fn new(
        mut path: PathBuf,
        (schema, properties): Self::Schema,
        flush_threshold: Option<usize>,
    ) -> Result<Self> {
        path.set_extension("parquet");
        let file =
            File::create(&path).with_context(|| format!("Could not create {}", path.display()))?;
        let file_writer = ParquetWriter::try_new(file, schema.clone(), Some(properties.clone()))
            .with_context(|| {
                format!(
                    "Could not create writer for {} with schema {} and properties {:?}",
                    path.display(),
                    schema,
                    properties
                )
            })?;

        Ok(ParquetTableWriter {
            path,
            // See the documentation of `flush_threshold`: to keep row groups even,
            // the user must not add more than `properties.max_row_group_size()`
            // minus `flush_threshold` rows between two calls to `self.builder()`.
            // Defaulting to 90% of the row group size leaves a reasonable margin.
            flush_threshold: flush_threshold.unwrap_or(properties.max_row_group_size() * 9 / 10),
            file_writer: Some(file_writer),
            builder: Builder::default(),
        })
    }

    fn flush(&mut self) -> Result<()> {
        // Get built array
        let mut tmp = Builder::default();
        std::mem::swap(&mut tmp, &mut self.builder);
        let struct_array = tmp.finish()?;

        let file_writer = self
            .file_writer
            .as_mut()
            .expect("File writer is unexpectedly None");

        // Write it
        file_writer
            .write(&struct_array.into())
            .with_context(|| format!("Could not write to {}", self.path.display()))?;
        file_writer
            .flush()
            .with_context(|| format!("Could not flush to {}", self.path.display()))
    }

    fn close(mut self) -> Result<FileMetaData> {
        self.flush()?;
        self.file_writer
            .take()
            .expect("File writer is unexpectedly None")
            .close()
            .with_context(|| format!("Could not close {}", self.path.display()))
    }
}

impl<Builder: Default + StructArrayBuilder> ParquetTableWriter<Builder> {
    /// Flushes the internal buffer to disk if it reached [`Self::flush_threshold`],
    /// then returns the array builder.
    pub fn builder(&mut self) -> Result<&mut Builder> {
        if self.builder.len() >= self.flush_threshold {
            self.flush()?;
        }

        Ok(&mut self.builder)
    }
}

impl<Builder: Default + StructArrayBuilder> Drop for ParquetTableWriter<Builder> {
    fn drop(&mut self) {
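        // If close() was not called, flush the remaining rows and close the file
        // here; errors panic because Drop cannot return a Result.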
        if self.file_writer.is_some() {
            self.flush().unwrap();
            self.file_writer
                .take()
                .unwrap()
                .close()
                .with_context(|| format!("Could not close {}", self.path.display()))
                .unwrap();
        }
    }
}