cpg

Creating Polars dataframes from Rust objects

2023-05-28 #rust

Easy creation of dataframes using serde_arrow

Polars is a very nice Rust DataFrame library, with a Python API, which can be used as a replacement for pandas.

The problem

There are a couple of discussions online on how to easily and efficiently create polars DataFrames from a sequence of Rust objects, say Vec<T> or Iterator<Item=T>, where each item corresponds to a row.

Solution with serde_arrow

Here is a possible solution, using the serde_arrow crate.

It can be used as follows:

use serde::Serialize;
/// Example row type: every `Characters` value pushed into the builder becomes
/// one row, and every field becomes one column.
///
/// `Serialize` and `Default` are both derived because `DataFrameBuilder<T>`
/// requires `T: Serialize + Default`.
#[derive(Serialize, Default)]
struct Characters {
    /// Name of the character (string column).
    name: String,
    /// Age of the character (`u8` column).
    age: u8,
}
/// Minimal usage example: build a one-row dataframe from a `Characters` value.
fn main() -> anyhow::Result<()> {
    // The single row that will end up in the dataframe.
    let goofy = Characters {
        name: String::from("Goofy"),
        age: 30,
    };

    // `extend` takes `&self`, so the builder does not need to be `mut`.
    let builder = DataFrameBuilder::<Characters>::new()?;
    builder.extend([goofy])?;

    // `finalize` consumes the builder and hands back the dataframe.
    let df = builder.finalize()?;
    Ok(())
}
 

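The idea is to trace the Arrow fields from a default value of T, push every row into a serde_arrow ArraysBuilder, and finally turn each built Arrow array into a Polars Series: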

use std::sync::{Arc, Mutex};
 
use arrow2::datatypes::DataType as ArrowDataType;
use polars::prelude::*;
use serde::Serialize;
use serde_arrow::arrow2::ArraysBuilder;
 
/// Collects rows of type `T` and converts them into a Polars `DataFrame`.
///
/// Rows are serialized with `serde_arrow` into Arrow arrays, one array per
/// field of `T`.
pub struct DataFrameBuilder<T> {
    // serde_arrow array builder; kept behind `Arc<Mutex<..>>` so rows can be
    // pushed through a shared `&self` reference.
    builder: Arc<Mutex<ArraysBuilder>>,
    // Arrow fields (column names and data types) the builder was created with.
    fields: Vec<arrow2::datatypes::Field>,
    // Ties the builder to its row type without storing a `T`.
    t: std::marker::PhantomData<T>,
}
 
impl<T: Serialize + Default> DataFrameBuilder<T> {
    pub fn new() -> Result<Self, Error> {
        let fields = serde_arrow::arrow2::serialize_into_fields(
            &vec![T::default()],
            Default::default(),
        )?;
 
        let builder = ArraysBuilder::new(&fields)?;
 
        Ok(Self {
            fields,
            builder: Arc::new(Mutex::new(builder)),
            t: std::marker::PhantomData,
        })
    }
    pub fn extend(
        &self,
        data: impl IntoIterator<Item = T>,
    ) -> Result<(), Error> {
        let mut builder = self.builder.lock().unwrap();
        for data in data {
            builder.push(&data).map_err(|_| Error::Serialization)?;
        }
        Ok(())
    }
    pub fn finalize(self) -> Result<DataFrame, Error> {
        let mut builder = self.builder.lock().unwrap();
        let arrays = builder.build_arrays()?;
        let mut series = vec![];
        for (f, array) in self.fields.iter().zip(arrays) {
            let data_type = match f.data_type {
                ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => {
                    DataType::Utf8
                }
                ArrowDataType::UInt8 => DataType::UInt8,
                ArrowDataType::UInt16 => DataType::UInt16,
                ArrowDataType::UInt32 => DataType::UInt32,
                ArrowDataType::UInt64 => DataType::UInt64,
                ArrowDataType::Int8 => DataType::Int8,
                ArrowDataType::Int16 => DataType::Int16,
                ArrowDataType::Int32 => DataType::Int32,
                ArrowDataType::Int64 => DataType::Int64,
                ArrowDataType::Float32 => DataType::Float32,
                ArrowDataType::Float64 => DataType::Float64,
                ArrowDataType::Boolean => DataType::Boolean,
                _ => {
                    return Err(Error::UnsupportedType(f.name.clone()));
                }
            };
            // This is unsafe but the most efficient way to create the series from the
            // arrow data.
            // Unless something has gone very wrong, the data type is ensured to be
            // correct.
            unsafe {
                series.push(Series::from_chunks_and_dtype_unchecked(
                    &f.name,
                    vec![array],
                    &data_type,
                ));
            }
        }
        Ok(DataFrame::new(series)?)
    }
}
 

Note that a bit more code is required in `new` if the schema cannot be deduced from the default value alone (e.g. for enums).