Protect Your Data & ML Products from Low-Quality Data

The open-source framework for precision data testing for data scientists and ML engineers.

Install Pandera & get started

Copied to clipboard!

$ pip install pandera

Build confidence in the quality of your data by defining schemas for complex data objects

Pandera provides a simple, flexible and extensible data-testing framework for validating not only your data, but also the functions that produce them.

Copied to clipboard!

import pandas as pd
import pandera as pa

from pandera.typing import Series, DataFrame

# Schema for the example data: one constrained column per annotated field.
# NOTE(review): newer pandera releases name this base class DataFrameModel;
# confirm against the pandera version this example targets.
class Schema(pa.SchemaModel):
	# `item` is coerced to str and must be one of the listed names.
	item: Series[str] = pa.Field(
		isin=["apple", "orange"],
		coerce=True,
	)
	# `price` is coerced to float and must be strictly greater than zero.
	price: Series[float] = pa.Field(
		gt=0,
		coerce=True,
	)

# Runtime validation: check_types validates the annotated argument on each call
@pa.check_types(lazy=True)
def transform_data(data: DataFrame[Schema]):
	"""Placeholder transformation whose ``data`` argument is annotated with ``Schema``.

	The body is intentionally empty; the example only demonstrates that the
	decorator validates ``data`` when the function is called.
	"""
	...

# Call the decorated function with two records that each break one Schema rule.
# NOTE(review): with lazy=True pandera is expected to collect both failures
# and report them together rather than stopping at the first — confirm
# against the pandera lazy-validation documentation.
transform_data(
	pd.DataFrame.from_records([
		{"item": "applee", "price": 0.5}, 	# invalid item name: "applee" is not in the isin list
		{"item": "orange", "price": -1000}, 	# negative price: violates gt=0
	])
)

Copied to clipboard!

import hypothesis
import pandera as pa

from pandera.typing import Series, DataFrame

# Input schema: the shape of the data that transform_data accepts.
# NOTE(review): newer pandera releases name this base class DataFrameModel;
# confirm against the pandera version this example targets.
class Schema(pa.SchemaModel):
	# `item` is coerced to str and must be one of the listed names.
	item: Series[str] = pa.Field(
		isin=["apple", "orange"],
		coerce=True,
	)
	# `price` is coerced to float and must be strictly greater than zero.
	price: Series[float] = pa.Field(
		gt=0,
		coerce=True,
	)

# Output schema: subclasses Schema and redeclares `item` so that "apple"
# is the only accepted value.
class OutputSchema(Schema):
	item: Series[str] = pa.Field(isin=["apple"])

# Meant to filter out oranges so that the result satisfies OutputSchema
@pa.check_types(lazy=True)
def transform_data(data: DataFrame[Schema]) -> DataFrame[OutputSchema]:
	"""Return the rows of ``data`` selected by the item query.

	The argument is annotated with ``Schema`` and the return value with
	``OutputSchema``. The query is deliberately wrong (marked with the bug
	emoji): it selects the "orange" rows instead of removing them, which is
	what this example sets out to demonstrate.
	"""
	# 🐛 Incorrect implementation
	selected_rows = data.query("item =='orange'")
	return selected_rows

# Property-based test: hypothesis supplies `data` drawn from Schema.strategy(size=10)
@hypothesis.given(Schema.strategy(size=10))
def test_transform_data(data):
	"""Run ``transform_data`` on a generated dataframe.

	There are no explicit assertions; any exception raised by the call
	propagates and fails the test.
	"""
	transform_data(data)

# Run the unit test by calling the hypothesis-wrapped function directly.
# NOTE(review): because transform_data is marked as incorrectly implemented,
# this call is expected to surface a validation failure — confirm.
test_transform_data()