refactor: 🚀 switch core data ops from pandas to polars (training, benchmarks, plotting, W&B) (#21619)

Signed-off-by: Onuralp SEZER <onuralp@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Onuralp SEZER <onuralp@ultralytics.com>
This commit is contained in:
Jing Qiu 2025-08-26 13:22:44 +08:00 committed by GitHub
parent 676a8bbb82
commit bcb1961a9b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 71 additions and 159 deletions

View file

@@ -516,12 +516,9 @@ All Ultralytics `predict()` calls will return a list of `Results` objects:
| `save_txt()` | `str` | Saves detection results to a text file and returns the path to the saved file. |
| `save_crop()` | `None` | Saves cropped detection images to specified directory. |
| `summary()` | `List[Dict[str, Any]]` | Converts inference results to a summarized dictionary with optional normalization. |
| `to_df()` | `DataFrame` | Converts detection results to a Pandas DataFrame. |
| `to_df()` | `DataFrame` | Converts detection results to a Polars DataFrame. |
| `to_csv()` | `str` | Converts detection results to CSV format. |
| `to_xml()` | `str` | Converts detection results to XML format. |
| `to_html()` | `str` | Converts detection results to HTML format. |
| `to_json()` | `str` | Converts detection results to JSON format. |
| `to_sql()` | `None` | Converts detection results to SQL-compatible format and saves to database. |
For more details see the [`Results` class documentation](../reference/engine/results.md).

View file

@@ -93,7 +93,7 @@ Each of these settings plays a vital role in the validation process, allowing fo
allowfullscreen>
</iframe>
<br>
<strong>Watch:</strong> How to Export Model Validation Results in CSV, JSON, SQL, Pandas DataFrame & More
<strong>Watch:</strong> How to Export Model Validation Results in CSV, JSON, SQL, Polars DataFrame & More
</p>
<a href="https://github.com/ultralytics/notebooks/blob/main/notebooks/how-to-export-the-validation-results-into-dataframe-csv-sql-and-other-formats.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Explore model validation and different export methods in Google Colab"></a>
@@ -133,15 +133,12 @@ The below examples showcase YOLO model validation with custom arguments in Pytho
print(results.confusion_matrix.to_df())
```
| Method | Return Type | Description |
| ----------- | ---------------------- | -------------------------------------------------------------------------------- |
| `summary()` | `List[Dict[str, Any]]` | Converts validation results to a summarized dictionary. |
| `to_df()` | `DataFrame` | Returns the validation results as a structured Pandas DataFrame. |
| `to_csv()` | `str` | Exports the validation results in CSV format and returns the CSV string. |
| `to_xml()` | `str` | Exports the validation results in XML format and returns the XML string. |
| `to_html()` | `str` | Exports the validation results in HTML table format and returns the HTML string. |
| `to_json()` | `str` | Exports the validation results in JSON format and returns the JSON string. |
| `to_sql()` | `str` | Exports the validation results to an SQL database. |
| Method | Return Type | Description |
| ----------- | ---------------------- | -------------------------------------------------------------------------- |
| `summary()` | `List[Dict[str, Any]]` | Converts validation results to a summarized dictionary. |
| `to_df()` | `DataFrame` | Returns the validation results as a structured Polars DataFrame. |
| `to_csv()` | `str` | Exports the validation results in CSV format and returns the CSV string. |
| `to_json()` | `str` | Exports the validation results in JSON format and returns the JSON string. |
For more details see the [`DataExportMixin` class documentation](../reference/utils/__init__.md/#ultralytics.utils.DataExportMixin).

View file

@@ -167,7 +167,7 @@ While the standard installation methods cover most use cases, you might need a m
2. **Manually install dependencies:** You need to install all required packages listed in the `pyproject.toml` file, substituting or modifying versions as needed. For the headless OpenCV example:
```bash
# Install other core dependencies
pip install torch torchvision numpy matplotlib pandas pyyaml pillow psutil requests scipy seaborn ultralytics-thop
pip install torch torchvision numpy matplotlib polars pyyaml pillow psutil requests scipy seaborn ultralytics-thop
# Install headless OpenCV instead of the default
pip install opencv-python-headless
@@ -237,7 +237,7 @@ While the standard installation methods cover most use cases, you might need a m
# Core dependencies
numpy
matplotlib
pandas
polars
pyyaml
Pillow
psutil

View file

@@ -73,7 +73,7 @@ dependencies = [
"torchvision>=0.9.0",
"psutil", # system utilization
"py-cpuinfo", # display CPU info
"pandas>=1.1.4",
"polars",
"ultralytics-thop>=2.0.0", # FLOPs computation https://github.com/ultralytics/thop
]
@@ -118,7 +118,6 @@ extra = [
"faster-coco-eval>=1.6.7", # COCO mAP
]
typing = [
"pandas-stubs",
"scipy-stubs",
"types-pillow",
"types-psutil",

View file

@@ -209,16 +209,11 @@ def test_val(task: str, weight: str, data: str) -> None:
metrics = model.val(data=data, imgsz=32, plots=plots)
metrics.to_df()
metrics.to_csv()
metrics.to_xml()
metrics.to_html()
metrics.to_json()
metrics.to_sql()
metrics.confusion_matrix.to_df() # Tests for confusion matrix export
# Tests for confusion matrix export
metrics.confusion_matrix.to_df()
metrics.confusion_matrix.to_csv()
metrics.confusion_matrix.to_xml()
metrics.confusion_matrix.to_html()
metrics.confusion_matrix.to_json()
metrics.confusion_matrix.to_sql()
def test_train_scratch():
@@ -304,10 +299,7 @@ def test_results(model: str):
r.save_crop(save_dir=TMP / "runs/tests/crops/")
r.to_df(decimals=3) # Align to_ methods: https://docs.ultralytics.com/modes/predict/#working-with-results
r.to_csv()
r.to_xml()
r.to_html()
r.to_json(normalize=True)
r.to_sql()
r.plot(pil=True, save=True, filename=TMP / "results_plot_save.jpg")
r.plot(conf=True, boxes=True)
print(r, len(r), r.path) # print after methods

View file

@@ -24,7 +24,7 @@ download: |
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from ultralytics.utils import TQDM
from ultralytics.utils.downloads import download
@@ -45,7 +45,7 @@ download: |
# Convert labels
names = "image", "x1", "y1", "x2", "y2", "class", "image_width", "image_height" # column names
for d in "annotations_train.csv", "annotations_val.csv", "annotations_test.csv":
x = pd.read_csv(dir / "annotations" / d, names=names).values # annotations
x = pl.read_csv(dir / "annotations" / d, names=names).to_numpy() # annotations
images, unique_images = x[:, 0], np.unique(x[:, 0])
with open((dir / d).with_suffix(".txt").__str__().replace("annotations_", ""), "w", encoding="utf-8") as f:
f.writelines(f"./images/{s}\n" for s in unique_images)

View file

@@ -222,12 +222,9 @@ class Results(SimpleClass, DataExportMixin):
save_txt: Save detection results to a text file.
save_crop: Save cropped detection images to specified directory.
summary: Convert inference results to a summarized dictionary.
to_df: Convert detection results to a Pandas DataFrame.
to_df: Convert detection results to a Polars DataFrame.
to_json: Convert detection results to JSON format.
to_csv: Convert detection results to a CSV format.
to_xml: Convert detection results to XML format.
to_html: Convert detection results to HTML format.
to_sql: Convert detection results to an SQL-compatible format.
Examples:
>>> results = model("path/to/image.jpg")

View file

@@ -540,10 +540,10 @@ class BaseTrainer:
torch.cuda.empty_cache()
def read_results_csv(self):
"""Read results.csv into a dictionary using pandas."""
import pandas as pd # scope for faster 'import ultralytics'
"""Read results.csv into a dictionary using polars."""
import polars as pl # scope for faster 'import ultralytics'
return pd.read_csv(self.csv).to_dict(orient="list")
return pl.read_csv(self.csv).to_dict(as_series=False)
def _model_train(self):
"""Set model in training mode."""

View file

@@ -134,17 +134,14 @@ class DataExportMixin:
Mixin class for exporting validation metrics or prediction results in various formats.
This class provides utilities to export performance metrics (e.g., mAP, precision, recall) or prediction results
from classification, object detection, segmentation, or pose estimation tasks into various formats: Pandas
DataFrame, CSV, XML, HTML, JSON and SQLite (SQL).
from classification, object detection, segmentation, or pose estimation tasks into various formats: Polars
DataFrame, CSV and JSON.
Methods:
to_df: Convert summary to a Pandas DataFrame.
to_df: Convert summary to a Polars DataFrame.
to_csv: Export results as a CSV string.
to_xml: Export results as an XML string (requires `lxml`).
to_html: Export results as an HTML table.
to_json: Export results as a JSON string.
tojson: Deprecated alias for `to_json()`.
to_sql: Export results to an SQLite database.
Examples:
>>> model = YOLO("yolo11n.pt")
@@ -152,12 +149,11 @@ class DataExportMixin:
>>> df = results.to_df()
>>> print(df)
>>> csv_data = results.to_csv()
>>> results.to_sql(table_name="yolo_results")
"""
def to_df(self, normalize=False, decimals=5):
"""
Create a pandas DataFrame from the prediction results summary or validation metrics.
Create a polars DataFrame from the prediction results summary or validation metrics.
Args:
normalize (bool, optional): Normalize numerical values for easier comparison.
@@ -166,13 +162,13 @@ class DataExportMixin:
Returns:
(DataFrame): DataFrame containing the summary data.
"""
import pandas as pd # scope for faster 'import ultralytics'
import polars as pl # scope for faster 'import ultralytics'
return pd.DataFrame(self.summary(normalize=normalize, decimals=decimals))
return pl.DataFrame(self.summary(normalize=normalize, decimals=decimals))
def to_csv(self, normalize=False, decimals=5):
"""
Export results to CSV string format.
Export results or metrics to CSV string format.
Args:
normalize (bool, optional): Normalize numeric values.
@@ -181,44 +177,25 @@ class DataExportMixin:
Returns:
(str): CSV content as string.
"""
return self.to_df(normalize=normalize, decimals=decimals).to_csv()
import polars as pl
def to_xml(self, normalize=False, decimals=5):
"""
Export results to XML format.
Args:
normalize (bool, optional): Normalize numeric values.
decimals (int, optional): Decimal precision.
Returns:
(str): XML string.
Notes:
Requires `lxml` package to be installed.
"""
df = self.to_df(normalize=normalize, decimals=decimals)
return '<?xml version="1.0" encoding="utf-8"?>\n<root></root>' if df.empty else df.to_xml(parser="etree")
def to_html(self, normalize=False, decimals=5, index=False):
"""
Export results to HTML table format.
try:
return df.write_csv()
except Exception:
# Minimal string conversion for any remaining complex types
def _to_str_simple(v):
if v is None:
return ""
if isinstance(v, (dict, list, tuple, set)):
return repr(v)
return str(v)
Args:
normalize (bool, optional): Normalize numeric values.
decimals (int, optional): Decimal precision.
index (bool, optional): Whether to include index column in the HTML table.
Returns:
(str): HTML representation of the results.
"""
df = self.to_df(normalize=normalize, decimals=decimals)
return "<table></table>" if df.empty else df.to_html(index=index)
def tojson(self, normalize=False, decimals=5):
"""Deprecated version of to_json()."""
LOGGER.warning("'result.tojson()' is deprecated, replace with 'result.to_json()'.")
return self.to_json(normalize, decimals)
df_str = df.select(
[pl.col(c).map_elements(_to_str_simple, return_dtype=pl.String).alias(c) for c in df.columns]
)
return df_str.write_csv()
def to_json(self, normalize=False, decimals=5):
"""
@@ -231,52 +208,7 @@ class DataExportMixin:
Returns:
(str): JSON-formatted string of the results.
"""
return self.to_df(normalize=normalize, decimals=decimals).to_json(orient="records", indent=2)
def to_sql(self, normalize=False, decimals=5, table_name="results", db_path="results.db"):
"""
Save results to an SQLite database.
Args:
normalize (bool, optional): Normalize numeric values.
decimals (int, optional): Decimal precision.
table_name (str, optional): Name of the SQL table.
db_path (str, optional): SQLite database file path.
"""
df = self.to_df(normalize, decimals)
if df.empty or df.columns.empty: # Exit if df is None or has no columns (i.e., no schema)
return
import sqlite3
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# Dynamically create table schema based on summary to support prediction and validation results export
columns = []
for col in df.columns:
sample_val = df[col].dropna().iloc[0] if not df[col].dropna().empty else ""
if isinstance(sample_val, dict):
col_type = "TEXT"
elif isinstance(sample_val, (float, int)):
col_type = "REAL"
else:
col_type = "TEXT"
columns.append(f'"{col}" {col_type}') # Quote column names to handle special characters like hyphens
# Create table (Drop table from db if it's already exist)
cursor.execute(f'DROP TABLE IF EXISTS "{table_name}"')
cursor.execute(f'CREATE TABLE "{table_name}" (id INTEGER PRIMARY KEY AUTOINCREMENT, {", ".join(columns)})')
for _, row in df.iterrows():
values = [json.dumps(v) if isinstance(v, dict) else v for v in row]
column_names = ", ".join(f'"{col}"' for col in df.columns)
placeholders = ", ".join("?" for _ in df.columns)
cursor.execute(f'INSERT INTO "{table_name}" ({column_names}) VALUES ({placeholders})', values)
conn.commit()
conn.close()
LOGGER.info(f"Results saved to SQL table '{table_name}' in '{db_path}'.")
return self.to_df(normalize=normalize, decimals=decimals).write_json()
class SimpleClass:

View file

@@ -77,7 +77,7 @@ def benchmark(
**kwargs (Any): Additional keyword arguments for exporter.
Returns:
(pandas.DataFrame): A pandas DataFrame with benchmark results for each format, including file size, metric,
(polars.DataFrame): A polars DataFrame with benchmark results for each format, including file size, metric,
and inference time.
Examples:
@@ -88,10 +88,11 @@ def benchmark(
imgsz = check_imgsz(imgsz)
assert imgsz[0] == imgsz[1] if isinstance(imgsz, list) else True, "benchmark() only supports square imgsz."
import pandas as pd # scope for faster 'import ultralytics'
import polars as pl # scope for faster 'import ultralytics'
pd.options.display.max_columns = 10
pd.options.display.width = 120
pl.Config.set_tbl_cols(10)
pl.Config.set_tbl_width_chars(120)
pl.Config.set_tbl_hide_dataframe_shape(True)
device = select_device(device, verbose=False)
if isinstance(model, (str, Path)):
model = YOLO(model)
@@ -193,20 +194,20 @@ def benchmark(
# Print results
check_yolo(device=device) # print system info
df = pd.DataFrame(y, columns=["Format", "Status❔", "Size (MB)", key, "Inference time (ms/im)", "FPS"])
df = pl.DataFrame(y, schema=["Format", "Status❔", "Size (MB)", key, "Inference time (ms/im)", "FPS"])
name = model.model_name
dt = time.time() - t0
legend = "Benchmarks legend: - ✅ Success - ❎ Export passed but validation failed - ❌️ Export failed"
s = f"\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({dt:.2f}s)\n{legend}\n{df.fillna('-')}\n"
s = f"\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({dt:.2f}s)\n{legend}\n{df.fill_null('-')}\n"
LOGGER.info(s)
with open("benchmarks.log", "a", errors="ignore", encoding="utf-8") as f:
f.write(s)
if verbose and isinstance(verbose, float):
metrics = df[key].array # values to compare to floor
metrics = df[key].to_numpy() # values to compare to floor
floor = verbose # minimum metric floor to pass, i.e. = 0.29 mAP for YOLOv5n
assert all(x > floor for x in metrics if pd.notna(x)), f"Benchmark failure: metric(s) < floor {floor}"
assert all(x > floor for x in metrics if not np.isnan(x)), f"Benchmark failure: metric(s) < floor {floor}"
return df

View file

@@ -34,13 +34,19 @@ def _custom_table(x, y, classes, title="Precision Recall Curve", x_title="Recall
Returns:
(wandb.Object): A wandb object suitable for logging, showcasing the crafted metric visualization.
"""
import pandas # scope for faster 'import ultralytics'
import polars as pl # scope for faster 'import ultralytics'
import polars.selectors as cs
df = pl.DataFrame({"class": classes, "y": y, "x": x}).with_columns(cs.numeric().round(3))
data = df.select(["class", "y", "x"]).rows()
df = pandas.DataFrame({"class": classes, "y": y, "x": x}).round(3)
fields = {"x": "x", "y": "y", "class": "class"}
string_fields = {"title": title, "x-axis-title": x_title, "y-axis-title": y_title}
return wb.plot_table(
"wandb/area-under-curve/v0", wb.Table(dataframe=df), fields=fields, string_fields=string_fields
"wandb/area-under-curve/v0",
wb.Table(data=data, columns=["class", "y", "x"]),
fields=fields,
string_fields=string_fields,
)

View file

@@ -557,7 +557,7 @@ class Annotator:
return width, height, width * height
@TryExcept() # known issue https://github.com/ultralytics/yolov5/issues/5395
@TryExcept()
@plt_settings()
def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
"""
@@ -571,7 +571,7 @@ def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
on_plot (Callable, optional): Function to call after plot is saved.
"""
import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
import pandas
import polars
from matplotlib.colors import LinearSegmentedColormap
# Filter matplotlib>=3.7.2 warning
@@ -582,16 +582,7 @@ def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
LOGGER.info(f"Plotting labels to {save_dir / 'labels.jpg'}... ")
nc = int(cls.max() + 1) # number of classes
boxes = boxes[:1000000] # limit to 1M boxes
x = pandas.DataFrame(boxes, columns=["x", "y", "width", "height"])
try: # Seaborn correlogram
import seaborn
seaborn.pairplot(x, corner=True, diag_kind="auto", kind="hist", diag_kws=dict(bins=50), plot_kws=dict(pmax=0.9))
plt.savefig(save_dir / "labels_correlogram.jpg", dpi=200)
plt.close()
except ImportError:
pass # Skip if seaborn is not installed
x = polars.DataFrame(boxes, schema=["x", "y", "width", "height"])
# Matplotlib labels
subplot_3_4_color = LinearSegmentedColormap.from_list("white_blue", ["white", "blue"])
@@ -608,7 +599,7 @@ def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
boxes = np.column_stack([0.5 - boxes[:, 2:4] / 2, 0.5 + boxes[:, 2:4] / 2]) * 1000
img = Image.fromarray(np.ones((1000, 1000, 3), dtype=np.uint8) * 255)
for cls, box in zip(cls[:500], boxes[:500]):
ImageDraw.Draw(img).rectangle(box, width=1, outline=colors(cls)) # plot
ImageDraw.Draw(img).rectangle(box.tolist(), width=1, outline=colors(cls)) # plot
ax[1].imshow(img)
ax[1].axis("off")
@@ -878,7 +869,7 @@ def plot_results(
>>> plot_results("path/to/results.csv", segment=True)
"""
import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
import pandas as pd
import polars as pl
from scipy.ndimage import gaussian_filter1d
save_dir = Path(file).parent if file else Path(dir)
@@ -899,11 +890,11 @@ def plot_results(
assert len(files), f"No results.csv files found in {save_dir.resolve()}, nothing to plot."
for f in files:
try:
data = pd.read_csv(f)
data = pl.read_csv(f)
s = [x.strip() for x in data.columns]
x = data.values[:, 0]
x = data.select(data.columns[0]).to_numpy().flatten()
for i, j in enumerate(index):
y = data.values[:, j].astype("float")
y = data.select(data.columns[j]).to_numpy().flatten().astype("float")
# y[y == 0] = np.nan # don't show zero values
ax[i].plot(x, y, marker=".", label=f.stem, linewidth=2, markersize=8) # actual results
ax[i].plot(x, gaussian_filter1d(y, sigma=3), ":", label="smooth", linewidth=2) # smoothing line
@@ -965,7 +956,7 @@ def plot_tune_results(csv_file: str = "tune_results.csv"):
>>> plot_tune_results("path/to/tune_results.csv")
"""
import matplotlib.pyplot as plt # scope for faster 'import ultralytics'
import pandas as pd
import polars as pl
from scipy.ndimage import gaussian_filter1d
def _save_one_file(file):
@@ -976,10 +967,10 @@ def plot_tune_results(csv_file: str = "tune_results.csv"):
# Scatter plots for each hyperparameter
csv_file = Path(csv_file)
data = pd.read_csv(csv_file)
data = pl.read_csv(csv_file)
num_metrics_columns = 1
keys = [x.strip() for x in data.columns][num_metrics_columns:]
x = data.values
x = data.to_numpy()
fitness = x[:, 0] # fitness
j = np.argmax(fitness) # max fitness index
n = math.ceil(len(keys) ** 0.5) # columns and rows in plot