Extracting directly to a DataFrame¶
In this example we'll show you how to extract directly to a pandas.DataFrame
from io import StringIO
from typing import Annotated, Any
from pydantic import (
BaseModel,
BeforeValidator,
PlainSerializer,
InstanceOf,
WithJsonSchema,
)
import pandas as pd
import instructor
import openai
def md_to_df(data: Any) -> Any:
# Convert markdown to DataFrame
if isinstance(data, str):
return (
pd.read_csv(
StringIO(data), # Process data
sep="|",
index_col=1,
)
.dropna(axis=1, how="all")
.iloc[1:]
.applymap(lambda x: x.strip())
)
return data
MarkdownDataFrame = Annotated[
# Validates final type
InstanceOf[pd.DataFrame],
# Converts markdown to DataFrame
BeforeValidator(md_to_df),
# Converts DataFrame to markdown on model_dump_json
PlainSerializer(lambda df: df.to_markdown()),
# Adds a description to the type
WithJsonSchema(
{
"type": "string",
"description": """
The markdown representation of the table,
each one should be tidy, do not try to join
tables that should be seperate""",
}
),
]
client = instructor.from_openai(openai.OpenAI())
def extract_df(data: str) -> pd.DataFrame:
return client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=MarkdownDataFrame,
messages=[
{
"role": "system",
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
},
{
"role": "user",
"content": f"Extract the data into a table: {data}",
},
],
)
class Table(BaseModel):
title: str
data: MarkdownDataFrame
def extract_table(data: str) -> Table:
return client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=Table,
messages=[
{
"role": "system",
"content": "You are a data extraction system, table of writing perfectly formatted markdown tables.",
},
{
"role": "user",
"content": f"Extract the data into a table: {data}",
},
],
)
if __name__ == "__main__":
df = extract_df(
"""Create a table of the last 5 presidents of the United States,
including their party and the years they served."""
)
assert isinstance(df, pd.DataFrame)
print(df)
"""
Party Years Served
President
Joe Biden Democrat 2021 - Present
Donald Trump Republican 2017 - 2021
Barack Obama Democrat 2009 - 2017
George W. Bush Republican 2001 - 2009
Bill Clinton Democrat 1993 - 2001
"""
table = extract_table(
"""Create a table of the last 5 presidents of the United States,
including their party and the years they served."""
)
assert isinstance(table, Table)
assert isinstance(table.data, pd.DataFrame)
print(table.title)
#> Last 5 Presidents of the United States
print(table.data)
"""
Party Years Served
President
Joe Biden Democratic 2021-2025
Donald Trump Republican 2017-2021
Barack Obama Democratic 2009-2017
George W. Bush Republican 2001-2009
Bill Clinton Democratic 1993-2001
"""
Notice that you can extract both the raw MarkdownDataFrame
or a more complex structure like Table
which includes a title and the data as a DataFrame. You can even request Iterable[Table]
to get multiple tables in a single response!