OfficeCLI/src/officecli/Core/PivotTableHelper.Cache.cs
zmworm 4a5b82de37 fix(xlsx): propagate source numFmt to pivot cacheField
When building a pivot from a source range, resolve each source column's
StyleIndex to its numFmtId and stamp it onto the cacheField. Without
this, a date-formatted column (numFmtId 164, yyyy-mm-dd) rendered in
the pivot as raw OADate serials (45306, 45337, ...) instead of the
intended date format. Reuses ResolveColumnNumFmtIds already used for
DataField.NumberFormatId.
2026-04-19 10:24:25 +08:00

882 lines
40 KiB
C#

// Copyright 2025 OfficeCli (officecli.ai)
// SPDX-License-Identifier: Apache-2.0
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Spreadsheet;
namespace OfficeCli.Core;
internal static partial class PivotTableHelper
{
// ==================== Date Grouping Preprocessing ====================
/// <summary>
/// Describes a single derived date-group column (year / quarter / month /
/// day) produced from a source date column. The cache builder consumes
/// these to emit the <c>&lt;fieldGroup&gt;</c> XML for the derived field
/// and the matching parent pointer on the base field.
/// </summary>
private sealed class DateGroupSpec
{
    /// <summary>Position of the source (base) date column in the final columnData list.</summary>
    public int BaseFieldIdx { get; set; }

    /// <summary>Position of the derived column itself in the final columnData list.</summary>
    public int DerivedFieldIdx { get; set; }

    /// <summary>Kind of bucket: "year", "quarter", "month" or "day". Defaults to the empty string.</summary>
    public string Grouping { get; set; } = string.Empty;

    /// <summary>Earliest date parsed from the source column, or null when none parsed.</summary>
    public DateTime? MinDate { get; set; }

    /// <summary>Latest date parsed from the source column, or null when none parsed.</summary>
    public DateTime? MaxDate { get; set; }
}
/// <summary>
/// Expands <c>fieldName:grouping</c> references found in the <c>rows</c>,
/// <c>cols</c>, <c>columns</c> and <c>filters</c> properties into derived
/// ("virtual") columns, one per unique (source column, grouping) pair.
/// The property strings are rewritten in place so that later field-list
/// parsing sees plain header names.
///
/// Example: with source columns [Date, Product, Amount] and
///   rows = "Date:year,Date:quarter"
///   cols = "Product"
/// the result is:
///   headers    = [Date, Product, Amount, Date (Year), Date (Quarter)]
///   columnData = [raw dates, products, amounts, year labels, quarter labels]
///   dateGroups = [ {Base=0, Derived=3, Grouping=year}, {Base=0, Derived=4, Grouping=quarter} ]
/// and the property becomes:
///   rows = "Date (Year),Date (Quarter)"
///
/// Several specs naming the same (field, grouping) pair share one derived
/// column. Cell values that do not parse as dates are copied through
/// unchanged, so a few stray non-date rows do not break the column.
/// </summary>
/// <param name="headers">Source header names; matched case-insensitively against the spec prefix.</param>
/// <param name="columnData">Column-major source data. Derived columns are APPENDED to this same list instance.</param>
/// <param name="properties">Pivot properties; grouped entries are rewritten in place.</param>
/// <returns>
/// The (possibly extended) headers, the same <paramref name="columnData"/>
/// list, and one <see cref="DateGroupSpec"/> per derived column. When no
/// grouping spec is found the inputs are returned as-is with an empty list.
/// </returns>
private static (string[] headers, List<string[]> columnData, List<DateGroupSpec> dateGroups) ApplyDateGrouping(
    string[] headers, List<string[]> columnData, Dictionary<string, string> properties)
{
    // Lookup from (source column, grouping) to the derived header name,
    // used to de-duplicate repeated references.
    var virtualColumns = new Dictionary<(int srcIdx, string grouping), string>();

    // Creation order of the derived columns. Dictionary enumeration order
    // is documented as undefined, so the output ordering (first reference
    // in the properties wins) is tracked explicitly instead of relying on
    // the dictionary.
    var virtualOrder = new List<(int srcIdx, string grouping, string virtName)>();

    bool RewriteFieldListProp(string propKey)
    {
        if (!properties.TryGetValue(propKey, out var raw) || string.IsNullOrEmpty(raw))
            return false;

        var parts = raw.Split(',');
        var outParts = new List<string>(parts.Length);
        bool changed = false;
        foreach (var p in parts)
        {
            var spec = p.Trim();
            if (spec.Length == 0) continue;

            // A grouping suffix is honoured only when the prefix matches an
            // existing header AND the suffix is a known grouping. Otherwise
            // the ':' may be part of the field name and must be left alone.
            var colonIdx = spec.LastIndexOf(':');
            if (colonIdx <= 0 || colonIdx == spec.Length - 1)
            {
                outParts.Add(spec);
                continue;
            }

            var fieldName = spec.Substring(0, colonIdx).Trim();
            var grouping = spec.Substring(colonIdx + 1).Trim().ToLowerInvariant();
            if (grouping != "year" && grouping != "quarter"
                && grouping != "month" && grouping != "day")
            {
                outParts.Add(spec);
                continue;
            }

            // First header that matches the prefix, ignoring case.
            int srcIdx = Array.FindIndex(headers,
                h => h != null && h.Equals(fieldName, StringComparison.OrdinalIgnoreCase));
            if (srcIdx < 0)
            {
                outParts.Add(spec);
                continue;
            }

            var key = (srcIdx, grouping);
            if (!virtualColumns.TryGetValue(key, out var virtName))
            {
                virtName = $"{fieldName} ({CapitalizeFirst(grouping)})";
                virtualColumns[key] = virtName;
                virtualOrder.Add((srcIdx, grouping, virtName));
            }
            outParts.Add(virtName);
            changed = true;
        }

        if (changed)
            properties[propKey] = string.Join(",", outParts);
        return changed;
    }

    // Every property must be visited (no short-circuit) so all of them are rewritten.
    foreach (var propKey in new[] { "rows", "cols", "columns", "filters" })
        RewriteFieldListProp(propKey);

    var dateGroups = new List<DateGroupSpec>();
    if (virtualOrder.Count == 0)
        return (headers, columnData, dateGroups);

    // Materialize each derived column and record its DateGroupSpec. The
    // source column is walked once per derived column to compute both the
    // bucket labels and the min/max dates stored on the spec.
    var newHeaders = new List<string>(headers);
    foreach (var (srcIdx, grouping, virtName) in virtualOrder)
    {
        var src = columnData[srcIdx];
        var derived = new string[src.Length];
        DateTime? min = null, max = null;
        for (int r = 0; r < src.Length; r++)
        {
            derived[r] = BucketDateValue(src[r], grouping);
            if (TryParseSourceDate(src[r], out var dt))
            {
                if (!min.HasValue || dt < min.Value) min = dt;
                if (!max.HasValue || dt > max.Value) max = dt;
            }
        }

        newHeaders.Add(virtName);
        columnData.Add(derived);
        dateGroups.Add(new DateGroupSpec
        {
            BaseFieldIdx = srcIdx,
            DerivedFieldIdx = columnData.Count - 1,
            Grouping = grouping,
            MinDate = min,
            MaxDate = max,
        });
    }

    return (newHeaders.ToArray(), columnData, dateGroups);
}
/// <summary>
/// Parses a cell value as a <see cref="DateTime"/>. Accepts a date string
/// (e.g. "2024-01-05") or an OLE Automation serial number (e.g. "45296").
/// </summary>
/// <param name="raw">Raw cell text; null or empty yields <c>false</c>.</param>
/// <param name="dt">The parsed date on success; <c>default</c> otherwise.</param>
/// <returns><c>true</c> when <paramref name="raw"/> was understood as a date.</returns>
private static bool TryParseSourceDate(string raw, out DateTime dt)
{
    dt = default;
    if (string.IsNullOrEmpty(raw)) return false;

    // CONSISTENCY(timezone): AssumeUniversal + AdjustToUniversal yields
    // Kind=Utc. Per the original author's note, Kind=Local values are
    // shifted by the local UTC offset when the OpenXML SDK serializes them
    // (e.g. UTC+8 turns Jan 15 into Jan 14), so Local must be avoided.
    if (DateTime.TryParse(raw, System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.AssumeUniversal | System.Globalization.DateTimeStyles.AdjustToUniversal,
            out dt))
        return true;

    if (!double.TryParse(raw, System.Globalization.NumberStyles.Float,
            System.Globalization.CultureInfo.InvariantCulture, out var serial))
        return false;

    // DateTime.FromOADate only accepts values strictly between these
    // bounds. Rejecting out-of-range numbers up front avoids paying for a
    // thrown exception on every row of a large plain-numeric column.
    const double oaDateExclusiveMin = -657435.0;
    const double oaDateExclusiveMax = 2958466.0;
    if (double.IsNaN(serial) || serial <= oaDateExclusiveMin || serial >= oaDateExclusiveMax)
        return false;

    try
    {
        dt = DateTime.FromOADate(serial);
        return true;
    }
    catch (ArgumentException)
    {
        // Values right at the boundary can still be rejected after rounding.
        dt = default;
        return false;
    }
}

/// <summary>
/// Maps a raw cell value to its date-bucket label for the given grouping.
/// Parsing is delegated to <see cref="TryParseSourceDate"/> so both code
/// paths always agree on what counts as a date.
/// </summary>
/// <param name="raw">Raw cell text (date string or OLE serial number).</param>
/// <param name="grouping">"year", "quarter", "month" or "day".</param>
/// <returns>
/// The bucket label ("2024", "Qtr1".."Qtr4", "Jan".."Dec", "1".."31").
/// Null becomes the empty string; unparseable values and unknown
/// groupings return <paramref name="raw"/> unchanged.
/// </returns>
private static string BucketDateValue(string raw, string grouping)
{
    if (string.IsNullOrEmpty(raw)) return raw ?? string.Empty;
    if (!TryParseSourceDate(raw, out var dt)) return raw;

    // Labels must match the canonical names produced by
    // ComputeDateGroupBuckets so the cache's groupItems and the derived
    // column data agree on bucket identity.
    return grouping switch
    {
        "year" => dt.Year.ToString("D4", System.Globalization.CultureInfo.InvariantCulture),
        "quarter" => $"Qtr{(dt.Month - 1) / 3 + 1}",
        "month" => MonthShortName(dt.Month),
        "day" => dt.Day.ToString(System.Globalization.CultureInfo.InvariantCulture),
        _ => raw,
    };
}

/// <summary>
/// Returns the fixed three-letter English abbreviation for a month number
/// (1 = "Jan" .. 12 = "Dec"). Any other value is returned as its invariant
/// decimal string.
/// </summary>
private static string MonthShortName(int month)
    => month switch
    {
        1 => "Jan", 2 => "Feb", 3 => "Mar", 4 => "Apr",
        5 => "May", 6 => "Jun", 7 => "Jul", 8 => "Aug",
        9 => "Sep", 10 => "Oct", 11 => "Nov", 12 => "Dec",
        _ => month.ToString(System.Globalization.CultureInfo.InvariantCulture),
    };
/// <summary>
/// Upper-cases the first character of <paramref name="s"/> using invariant
/// casing rules. Null and empty inputs are returned unchanged.
/// </summary>
private static string CapitalizeFirst(string s)
{
    if (string.IsNullOrEmpty(s))
    {
        return s;
    }

    char head = char.ToUpperInvariant(s[0]);
    string tail = s.Substring(1);
    return head + tail;
}
// ==================== Source Data Reader ====================
/// <summary>
/// Reads a rectangular source range from a worksheet and returns it in the
/// column-major shape the pivot cache builder expects.
/// </summary>
/// <param name="sourceSheet">Worksheet part that holds the source cells.</param>
/// <param name="sourceRef">Range such as "A1:D100"; '$' markers are ignored.</param>
/// <returns>
/// <c>headers</c>: text of the first row found in the range (nulls become "").
/// <c>columnData</c>: one string array per column holding the remaining rows.
/// <c>columnStyleIds</c>: per column, the StyleIndex of the first cell below
/// the range's start row that has a non-zero style, or null.
/// All three are empty when the sheet has no sheetData or no row falls in range.
/// </returns>
/// <exception cref="InvalidOperationException">The worksheet part has no Worksheet.</exception>
/// <exception cref="ArgumentException">
/// The range is not of the form "start:end", a column lies beyond XFD, or
/// the end column precedes the start column.
/// </exception>
private static (string[] headers, List<string[]> columnData, uint?[] columnStyleIds) ReadSourceData(
    WorksheetPart sourceSheet, string sourceRef)
{
    var ws = sourceSheet.Worksheet ?? throw new InvalidOperationException("Worksheet missing");
    var sheetData = ws.GetFirstChild<SheetData>();
    if (sheetData == null) return (Array.Empty<string>(), new List<string[]>(), Array.Empty<uint?>());

    // Parse range "A1:D100"
    var parts = sourceRef.Replace("$", "").Split(':');
    if (parts.Length != 2) throw new ArgumentException($"Invalid source range: {sourceRef}");
    var (startCol, startRow) = ParseCellRef(parts[0]);
    var (endCol, endRow) = ParseCellRef(parts[1]);
    var startColIdx = ColToIndex(startCol);
    var endColIdx = ColToIndex(endCol);

    // R6-3: reject columns beyond Excel's hard max (XFD = 16384) instead of
    // letting oversized indices produce a giant colCount downstream.
    const int ExcelMaxColumn = 16384; // XFD
    if (startColIdx > ExcelMaxColumn)
        throw new ArgumentException($"Column {startCol} out of range (max: XFD)");
    if (endColIdx > ExcelMaxColumn)
        throw new ArgumentException($"Column {endCol} out of range (max: XFD)");

    // A reversed range (e.g. "D1:A9") would give a non-positive colCount;
    // a negative one previously surfaced as an OverflowException from the
    // array allocations below. Fail early with a message naming the range.
    if (endColIdx < startColIdx)
        throw new ArgumentException(
            $"Invalid source range: {sourceRef} (end column {endCol} precedes start column {startCol})");

    var colCount = endColIdx - startColIdx + 1;

    // Read all rows in range. The StyleIndex of the first styled data cell
    // per column (rows below startRow only) is captured so pivot value
    // cells can inherit the source column's number format from the data
    // area rather than from the header.
    var rows = new List<string[]>();
    var columnStyleIds = new uint?[colCount];
    var sst = sourceSheet.OpenXmlPackage is SpreadsheetDocument doc
        ? doc.WorkbookPart?.GetPartsOfType<SharedStringTablePart>().FirstOrDefault()
        : null;

    foreach (var row in sheetData.Elements<Row>())
    {
        var rowIdx = (int)(row.RowIndex?.Value ?? 0);
        if (rowIdx < startRow || rowIdx > endRow) continue;

        var values = new string[colCount];
        foreach (var cell in row.Elements<Cell>())
        {
            var cellRef = cell.CellReference?.Value ?? "";
            var (cn, _) = ParseCellRef(cellRef);
            var ci = ColToIndex(cn) - startColIdx;
            if (ci < 0 || ci >= colCount) continue;

            values[ci] = GetCellText(cell, sst);

            // Keep only the first non-zero style seen below the start row.
            if (rowIdx > startRow && columnStyleIds[ci] == null && cell.StyleIndex?.Value is uint sIdx && sIdx != 0)
                columnStyleIds[ci] = sIdx;
        }
        rows.Add(values);
    }

    if (rows.Count == 0) return (Array.Empty<string>(), new List<string[]>(), Array.Empty<uint?>());

    // First collected row = headers (ensure no nulls).
    // NOTE(review): this is the first row PRESENT in sheetData within the
    // range; if the start row itself is absent, the first data row is
    // consumed as the header — confirm callers guarantee a header row.
    var headers = rows[0].Select(h => h ?? "").ToArray();

    // Remaining rows = data, transposed to column-major for the cache.
    var columnDataList = new List<string[]>();
    for (int c = 0; c < colCount; c++)
    {
        var colVals = new string[rows.Count - 1];
        for (int r = 1; r < rows.Count; r++)
            colVals[r - 1] = rows[r][c] ?? "";
        columnDataList.Add(colVals);
    }

    return (headers, columnDataList, columnStyleIds);
}
/// <summary>
/// Returns the text of a cell for use as a pivot source value.
/// </summary>
/// <param name="cell">The cell to read.</param>
/// <param name="sst">Shared string table part, or null when the workbook has none.</param>
/// <returns>
/// <c>ErrorCellSentinel</c> for error cells; the inline or shared string
/// text for string cells; otherwise the raw cell value text ("" if absent).
/// A shared-string cell whose index cannot be resolved falls back to the
/// raw value text.
/// </returns>
private static string GetCellText(Cell cell, SharedStringTablePart? sst)
{
    var dataType = cell.DataType?.Value;

    // Error cells (DataType=Error, e.g. #DIV/0!) must not be treated as
    // string values. Returning the sentinel lets the cache field builder
    // emit an error item instead of a string item.
    if (dataType == CellValues.Error)
        return ErrorCellSentinel;

    // Inline strings (t="inlineStr") — per the original author's note,
    // written by openpyxl and some other tools.
    if (dataType == CellValues.InlineString)
        return cell.InlineString is { } inline ? ReadStringWithoutPhonetics(inline) : "";

    var value = cell.CellValue?.Text ?? "";
    if (dataType == CellValues.SharedString && sst?.SharedStringTable != null
        && int.TryParse(value, System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture, out int idx))
    {
        // NOTE: ElementAtOrDefault walks the table from the start on every
        // call; a negative or out-of-range index yields null.
        var item = sst.SharedStringTable.Elements<SharedStringItem>().ElementAtOrDefault(idx);
        return item != null ? ReadStringWithoutPhonetics(item) : value;
    }

    return value;
}

/// <summary>
/// Concatenates the visible text of a string item (<c>&lt;si&gt;</c> or
/// <c>&lt;is&gt;</c>): the direct <c>&lt;t&gt;</c> child and the
/// <c>&lt;t&gt;</c> of each rich-text run. Phonetic runs
/// (<c>&lt;rPh&gt;</c>) are deliberately skipped — InnerText would append
/// their reading text to the cell value.
/// </summary>
private static string ReadStringWithoutPhonetics(OpenXmlElement stringItem)
{
    var sb = new StringBuilder();
    foreach (var child in stringItem.ChildElements)
    {
        switch (child)
        {
            case Text plain:
                sb.Append(plain.Text);
                break;
            case Run run:
                sb.Append(run.GetFirstChild<Text>()?.Text);
                break;
        }
    }
    return sb.ToString();
}
// ==================== Cache Definition Builder ====================
/// <summary>
/// Builds the pivotCacheDefinition (cache source plus one cacheField per
/// header) together with the per-field metadata the record writer needs.
/// </summary>
/// <param name="sourceSheetName">Sheet name written to the worksheetSource.</param>
/// <param name="sourceRef">Range reference written to the worksheetSource.</param>
/// <param name="headers">Field names; empty names become "Column{n}".</param>
/// <param name="columnData">Column-major values, index-aligned with <paramref name="headers"/>.</param>
/// <param name="axisFieldIndices">Fields forced through the string/indexed path even when numeric.</param>
/// <param name="dateGroups">Date-group specs identifying base and derived fields.</param>
/// <param name="columnNumFmtIds">Optional per-column numFmtId stamped on the cacheField when non-null.</param>
/// <returns>
/// The definition, <c>fieldNumeric[i]</c> (true when records should carry
/// the value directly as a number) and <c>fieldValueIndex[i]</c> (value →
/// shared/group item index for indexed fields).
/// </returns>
private static (PivotCacheDefinition def, bool[] fieldNumeric, Dictionary<string, int>[] fieldValueIndex)
    BuildCacheDefinition(
        string sourceSheetName, string sourceRef,
        string[] headers, List<string[]> columnData,
        HashSet<int>? axisFieldIndices = null,
        List<DateGroupSpec>? dateGroups = null,
        uint?[]? columnNumFmtIds = null)
{
    int fieldCount = headers.Length;
    int recordCount = columnData.Count > 0 ? columnData[0].Length : 0;

    // RefreshOnLoad is intentionally NOT set. Per the original author's
    // note, the pivot cells are pre-rendered into sheetData, and asking
    // Excel to refresh on load clears them and attempts a live rebuild
    // that can leave an empty pivot skeleton if the rebuild fails.
    var cacheDef = new PivotCacheDefinition
    {
        CreatedVersion = 3,
        MinRefreshableVersion = 3,
        RefreshedVersion = 3,
        RecordCount = (uint)recordCount
    };

    // cacheSource -> worksheetSource
    var worksheetSource = new WorksheetSource
    {
        Reference = sourceRef,
        Sheet = sourceSheetName
    };
    var cacheSource = new CacheSource { Type = SourceValues.Worksheet };
    cacheSource.AppendChild(worksheetSource);
    cacheDef.AppendChild(cacheSource);

    // Per-field outputs consumed when writing records:
    //   fieldNumeric[i]    - true => records emit the number directly
    //   fieldValueIndex[i] - value -> item index for indexed fields
    var fieldNumeric = new bool[fieldCount];
    var fieldValueIndex = new Dictionary<string, int>[fieldCount];

    // Date-group lookups: derived field index -> spec, and the set of base
    // field indices that own at least one derived field.
    var derivedByIdx = new Dictionary<int, DateGroupSpec>();
    var baseFields = new HashSet<int>();
    if (dateGroups is not null)
    {
        foreach (var groupSpec in dateGroups)
        {
            derivedByIdx[groupSpec.DerivedFieldIdx] = groupSpec;
            baseFields.Add(groupSpec.BaseFieldIdx);
        }
    }

    // Lowest derived-field index that hangs off the given base field.
    int FirstDerivedIndexFor(int baseIdx)
        => derivedByIdx
            .Where(entry => entry.Value.BaseFieldIdx == baseIdx)
            .Min(entry => entry.Key);

    var cacheFields = new CacheFields { Count = (uint)fieldCount };
    for (int i = 0; i < fieldCount; i++)
    {
        string fieldName = string.IsNullOrEmpty(headers[i]) ? $"Column{i + 1}" : headers[i];
        string[] values = i < columnData.Count ? columnData[i] : Array.Empty<string>();

        // R19-1: source numFmtId for this column, if the caller supplied
        // one. Null leaves whatever default the field builder chose.
        uint? srcNumFmtId = null;
        if (columnNumFmtIds != null && i < columnNumFmtIds.Length)
            srcNumFmtId = columnNumFmtIds[i];

        CacheField cacheField;
        if (derivedByIdx.TryGetValue(i, out var spec))
        {
            // Derived date-group field: synthesized from the spec, marked
            // non-numeric for the record writer.
            cacheField = BuildDateGroupDerivedCacheField(fieldName, spec, out fieldValueIndex[i]);
            fieldNumeric[i] = false;
        }
        else if (baseFields.Contains(i))
        {
            // Base date field: enumerated date items plus a parent pointer
            // to its first derived field; records reference items by index.
            int parIdx = FirstDerivedIndexFor(i);
            cacheField = BuildDateGroupBaseCacheField(fieldName, values, parIdx, out fieldValueIndex[i]);
            fieldNumeric[i] = false;
        }
        else
        {
            // Axis fields (row/col/filter) take the indexed path even when
            // their values look numeric, keeping item indices and record
            // references in sync.
            bool forceStringIndexed = axisFieldIndices?.Contains(i) == true;
            cacheField = BuildCacheField(
                fieldName, values, out fieldNumeric[i], out fieldValueIndex[i], forceStringIndexed);
        }

        // The source column's format, when known, overrides the builder default.
        if (srcNumFmtId.HasValue)
            cacheField.NumberFormatId = srcNumFmtId.Value;
        cacheFields.AppendChild(cacheField);
    }

    cacheDef.AppendChild(cacheFields);
    return (cacheDef, fieldNumeric, fieldValueIndex);
}
/// <summary>
/// Builds a plain (non date-grouped) cacheField with its sharedItems, and
/// reports how the record writer should reference the field's values.
/// </summary>
/// <param name="name">Field name written to the cacheField.</param>
/// <param name="values">All source values for the column (may contain "" and the error sentinel).</param>
/// <param name="isNumeric">
/// True when every non-blank, non-error value parses as a double AND
/// <paramref name="forceStringIndexed"/> is false. Note this can be true
/// even when the column holds no actual numbers (only blanks/errors).
/// </param>
/// <param name="valueIndex">
/// Value → sharedItems index. Populated only when items are enumerated
/// (the non-numeric branch); left empty in the numeric branch.
/// </param>
/// <param name="forceStringIndexed">Forces enumeration/index references even for numeric-looking values.</param>
/// <returns>The cacheField with a single sharedItems child.</returns>
private static CacheField BuildCacheField(
string name, string[] values, out bool isNumeric, out Dictionary<string, int> valueIndex,
bool forceStringIndexed = false)
{
var field = new CacheField { Name = name, NumberFormatId = 0u };
// Blanks and error-cell sentinels are ignored by the numeric check: a
// column counts as numeric when everything else parses as a double.
// An empty values array is never numeric.
bool valuesAreNumeric = values.Length > 0 && values.All(v =>
string.IsNullOrEmpty(v) || v == ErrorCellSentinel
|| double.TryParse(v, System.Globalization.CultureInfo.InvariantCulture, out _));
// When forceStringIndexed is true (axis fields), isNumeric is reported
// as false so the record writer uses the valueIndex map to emit
// <x v="N"/> references instead of <n v="..."/> direct values. Because
// the branch below tests isNumeric (not valuesAreNumeric), a forced
// field also takes the enumerating branch, so its numeric-looking
// values are written as string items and indexed.
isNumeric = valuesAreNumeric && !forceStringIndexed;
valueIndex = new Dictionary<string, int>(StringComparer.Ordinal);
var sharedItems = new SharedItems();
// MIXED strategy — per the original author's note, verified against
// Microsoft's pivot5.xlsx fixture from the Open XML SDK tests:
//
// • Numeric fields: emit ONLY containsNumber/minValue/maxValue metadata,
// no enumerated items, no count attribute. Records carry values
// directly via <n v="..."/>.
// • String fields: enumerate every unique value as <s v="..."/> with a
// count attribute. Records reference them by index via <x v="N"/>.
//
// The author's note also records that a uniform "always enumerate,
// always index" strategy (as LibreOffice writes) was tried and dropped:
// it is valid OOXML, but numeric data fields with enumerated items
// failed to render in Excel during testing.
bool hasErrorCells = values.Any(v => v == ErrorCellSentinel);
// The numeric branch additionally requires at least one real number;
// an all-blank/all-error column falls through to the enumerating branch.
if (isNumeric && values.Any(v => !string.IsNullOrEmpty(v) && v != ErrorCellSentinel))
{
var nums = values.Where(v => !string.IsNullOrEmpty(v) && v != ErrorCellSentinel)
.Select(v => double.Parse(v, System.Globalization.CultureInfo.InvariantCulture)).ToArray();
sharedItems.ContainsSemiMixedTypes = false;
sharedItems.ContainsString = false;
sharedItems.ContainsNumber = true;
sharedItems.MinValue = nums.Min();
sharedItems.MaxValue = nums.Max();
// Nothing is enumerated and valueIndex stays empty, so an error cell
// in a numeric column has no item to reference here.
}
else
{
// Distinct non-blank, non-error values, ordered by the OrderByAxis
// helper (defined outside this block).
var uniqueValues = values
.Where(v => !string.IsNullOrEmpty(v) && v != ErrorCellSentinel)
.Distinct()
.OrderByAxis(v => v)
.ToList();
// Error cells occupy their own ErrorItem slot after the string items.
// Every error cell carries the same sentinel, so this list has at
// most one entry.
var uniqueErrors = values
.Where(v => v == ErrorCellSentinel)
.Distinct()
.ToList();
int totalCount = uniqueValues.Count + uniqueErrors.Count;
sharedItems.Count = (uint)totalCount;
if (hasErrorCells)
{
sharedItems.ContainsSemiMixedTypes = false;
}
for (int i = 0; i < uniqueValues.Count; i++)
{
var v = uniqueValues[i];
// R2-2: strip XML-illegal chars (e.g. U+0000) before writing. The
// index map is keyed by the ORIGINAL value, not the sanitized one.
sharedItems.AppendChild(new StringItem { Val = SanitizeXmlText(v) });
if (!valueIndex.ContainsKey(v))
valueIndex[v] = i;
}
// Emit an ErrorItem for the error sentinel. The item is always written
// as "#VALUE!" regardless of the original error kind.
for (int i = 0; i < uniqueErrors.Count; i++)
{
sharedItems.AppendChild(new ErrorItem { Val = "#VALUE!" });
valueIndex[ErrorCellSentinel] = uniqueValues.Count + i;
}
// OOXML requires longText="1" when any string exceeds 255 chars. Per
// the original author's note, Excel otherwise reports "problem with
// some content" and repairs the file.
if (uniqueValues.Any(v => v.Length > 255))
sharedItems.LongText = true;
}
field.AppendChild(sharedItems);
return field;
}
// ==================== Date Group Cache Field Builders ====================
/// <summary>
/// Creates the cacheField for the BASE column of a date group. Each
/// distinct parsed date becomes a <c>&lt;d v="..."/&gt;</c> shared item
/// (in first-appearance order) and a <c>&lt;fieldGroup par="N"/&gt;</c>
/// child points at the supplied derived field index.
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="values">Raw source values; entries that do not parse as dates are ignored.</param>
/// <param name="parDerivedIdx">Index written to the fieldGroup's <c>par</c> attribute.</param>
/// <param name="valueIndex">
/// Raw cell text → shared item index, keyed by the ORIGINAL text because
/// that is what the record writer looks up. Different spellings of the
/// same date map to the same index.
/// </param>
/// <returns>The cacheField (numFmtId 164 by default) with sharedItems and fieldGroup children.</returns>
private static CacheField BuildDateGroupBaseCacheField(
    string name, string[] values, int parDerivedIdx,
    out Dictionary<string, int> valueIndex)
{
    valueIndex = new Dictionary<string, int>(StringComparer.Ordinal);

    // One pass over the column: collect distinct dates in source order,
    // track the overall range, and remember which item each raw text
    // resolves to (first occurrence of a given text wins).
    var distinctDates = new List<DateTime>();
    var slotByDate = new Dictionary<DateTime, int>();
    DateTime? earliest = null;
    DateTime? latest = null;
    foreach (var raw in values)
    {
        if (!TryParseSourceDate(raw, out var parsed)) continue;

        if (!slotByDate.TryGetValue(parsed, out var slot))
        {
            slot = distinctDates.Count;
            slotByDate[parsed] = slot;
            distinctDates.Add(parsed);
        }

        if (!earliest.HasValue || parsed < earliest.Value) earliest = parsed;
        if (!latest.HasValue || parsed > latest.Value) latest = parsed;

        valueIndex.TryAdd(raw, slot);
    }

    // Date-only shared items: containsDate set, string/non-date flags cleared.
    var sharedItems = new SharedItems
    {
        ContainsSemiMixedTypes = false,
        ContainsNonDate = false,
        ContainsDate = true,
        ContainsString = false,
        Count = (uint)distinctDates.Count
    };
    if (earliest.HasValue) sharedItems.MinDate = earliest.Value;
    if (latest.HasValue) sharedItems.MaxDate = latest.Value;
    for (int k = 0; k < distinctDates.Count; k++)
    {
        sharedItems.AppendChild(new DateTimeItem { Val = distinctDates[k] });
    }

    var field = new CacheField { Name = name, NumberFormatId = 164u };
    field.AppendChild(sharedItems);
    // <fieldGroup par="N"/> — parent pointer to the derived field chosen by the caller.
    field.AppendChild(new FieldGroup { ParentId = (uint)parDerivedIdx });
    return field;
}
/// <summary>
/// Creates the synthetic cacheField for a DERIVED date-group column
/// (year / quarter / month / day). The field is marked
/// <c>databaseField="0"</c> and carries a
/// <c>&lt;fieldGroup base="..."&gt;</c> holding a <c>&lt;rangePr&gt;</c>
/// and a <c>&lt;groupItems&gt;</c> list.
///
/// The group items are the real bucket labels wrapped by two sentinel
/// strings: a leading "&lt;" + start date and a trailing "&gt;" + end
/// date (end = max date + 1 day), or "&lt;start" / "&gt;end" when the
/// range is unknown.
/// </summary>
/// <param name="name">Field name.</param>
/// <param name="spec">Grouping kind, base field index and observed date range.</param>
/// <param name="valueIndex">Bucket label → groupItems index (1-based because of the leading sentinel).</param>
/// <returns>The derived cacheField.</returns>
private static CacheField BuildDateGroupDerivedCacheField(
    string name, DateGroupSpec spec, out Dictionary<string, int> valueIndex)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Exclusive end of the range: one day past the max date, unless the
    // max already sits on the last representable day (AddDays would overflow).
    DateTime? rangeEnd = null;
    if (spec.MaxDate.HasValue)
    {
        var maxDate = spec.MaxDate.Value;
        rangeEnd = maxDate < DateTime.MaxValue.Date ? maxDate.AddDays(1) : maxDate;
    }

    string leadingSentinel = spec.MinDate.HasValue
        ? "<" + spec.MinDate.Value.ToString("yyyy.MM.dd", invariant)
        : "<start";
    string trailingSentinel = rangeEnd.HasValue
        ? ">" + rangeEnd.Value.ToString("yyyy.MM.dd", invariant)
        : ">end";

    // Real buckets for this grouping, in canonical order.
    List<string> buckets = ComputeDateGroupBuckets(spec);

    // Layout of the group items:
    //   [0]      leading sentinel
    //   [1..N]   real buckets
    //   [N+1]    trailing sentinel
    var labels = new List<string>(buckets.Count + 2) { leadingSentinel };
    labels.AddRange(buckets);
    labels.Add(trailingSentinel);

    // Map each bucket label to its position in the list above.
    valueIndex = new Dictionary<string, int>(StringComparer.Ordinal);
    int position = 1;
    foreach (var bucket in buckets)
    {
        valueIndex[bucket] = position;
        position++;
    }

    var rangePr = new RangeProperties
    {
        GroupBy = spec.Grouping switch
        {
            "year" => GroupByValues.Years,
            "quarter" => GroupByValues.Quarters,
            "month" => GroupByValues.Months,
            _ => GroupByValues.Days, // "day" and any unrecognised grouping
        },
    };
    if (spec.MinDate.HasValue) rangePr.StartDate = spec.MinDate.Value;
    if (rangeEnd.HasValue) rangePr.EndDate = rangeEnd.Value;

    var groupItems = new GroupItems { Count = (uint)labels.Count };
    foreach (var label in labels)
    {
        // R2-2: labels are code-generated, but they are sanitized anyway
        // for parity with the sharedItems writer.
        groupItems.AppendChild(new StringItem { Val = SanitizeXmlText(label) });
    }

    var fieldGroup = new FieldGroup { Base = (uint)spec.BaseFieldIdx };
    fieldGroup.AppendChild(rangePr);
    fieldGroup.AppendChild(groupItems);

    var field = new CacheField
    {
        Name = name,
        NumberFormatId = 0u,
        DatabaseField = false // derived: not backed by a record column
    };
    field.AppendChild(fieldGroup);
    return field;
}
/// <summary>
/// Returns the ordered bucket labels for a date-group spec.
///
/// Quarter, month and day use FIXED label sets that do not depend on the
/// data: "Qtr1".."Qtr4", "Jan".."Dec" and "1".."31". Year is the
/// exception: it yields every year from the spec's min date to its max
/// date inclusive (a contiguous range, four digits each), or nothing when
/// either bound is missing. Unknown groupings yield an empty list.
///
/// Per the original author's note, these are the canonical names Excel
/// writes natively, and non-standard labels such as "2024-Q1" caused
/// Excel to fail on month grouping.
/// </summary>
/// <param name="spec">Spec supplying the grouping kind and date range.</param>
/// <returns>A new list of labels in bucket order.</returns>
private static List<string> ComputeDateGroupBuckets(DateGroupSpec spec)
{
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var labels = new List<string>();

    if (spec.Grouping == "year")
    {
        // Needs both ends of the range; otherwise there is nothing to list.
        if (spec.MinDate.HasValue && spec.MaxDate.HasValue)
        {
            int lastYear = spec.MaxDate.Value.Year;
            for (int year = spec.MinDate.Value.Year; year <= lastYear; year++)
            {
                labels.Add(year.ToString("D4", invariant));
            }
        }
    }
    else if (spec.Grouping == "quarter")
    {
        // Same four buckets regardless of how many years the data spans.
        labels.Add("Qtr1");
        labels.Add("Qtr2");
        labels.Add("Qtr3");
        labels.Add("Qtr4");
    }
    else if (spec.Grouping == "month")
    {
        // Three-letter English abbreviations, January first.
        string[] monthNames =
        {
            "Jan", "Feb", "Mar", "Apr", "May", "Jun",
            "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
        };
        labels.AddRange(monthNames);
    }
    else if (spec.Grouping == "day")
    {
        // Day-of-month 1..31.
        int day = 1;
        while (day <= 31)
        {
            labels.Add(day.ToString(invariant));
            day++;
        }
    }

    return labels;
}
// ==================== Cache Records Builder ====================
/// <summary>
/// Builds pivotCacheRecords: one record per source row, one item per
/// non-skipped field, using the mixed referencing strategy:
/// <code>
/// &lt;r&gt;
///   &lt;x v="0"/&gt;    indexed field, references item 0 of that field
///   &lt;n v="702"/&gt;  numeric field, value written directly
///   &lt;m/&gt;          blank, or a value with no index entry
/// &lt;/r&gt;
/// </code>
/// </summary>
/// <param name="columnData">Column-major values; the first column's length sets the record count.</param>
/// <param name="fieldNumeric">Per field: true to write values directly as numbers.</param>
/// <param name="fieldValueIndex">Per field: value → item index used for <c>&lt;x&gt;</c> references.</param>
/// <param name="skipFieldIndices">Fields that contribute no record entries (e.g. derived date-group fields).</param>
/// <returns>The populated records element.</returns>
private static PivotCacheRecords BuildCacheRecords(
    List<string[]> columnData, bool[] fieldNumeric, Dictionary<string, int>[] fieldValueIndex,
    HashSet<int>? skipFieldIndices = null)
{
    int fieldCount = columnData.Count;
    int recordCount = fieldCount > 0 ? columnData[0].Length : 0;

    // Picks the record item for one cell value of one field.
    OpenXmlElement ItemFor(int fieldIdx, string cellValue)
    {
        if (string.IsNullOrEmpty(cellValue))
            return new MissingItem();

        // Numbers are written inline. Error cells never take this path,
        // even in a numeric field, because the sentinel is not parseable.
        if (cellValue != ErrorCellSentinel && fieldNumeric[fieldIdx])
        {
            return new NumberItem
            {
                Val = double.Parse(cellValue, System.Globalization.CultureInfo.InvariantCulture)
            };
        }

        // Everything else (strings, indexed dates, error cells) is written
        // as an index reference when the field's map knows the value.
        if (fieldValueIndex[fieldIdx].TryGetValue(cellValue, out var itemIdx))
            return new FieldItem { Val = (uint)itemIdx };

        // No index entry: emit <m/> rather than a dangling reference.
        return new MissingItem();
    }

    var records = new PivotCacheRecords { Count = (uint)recordCount };
    for (int rowIdx = 0; rowIdx < recordCount; rowIdx++)
    {
        var record = new PivotCacheRecord();
        for (int fieldIdx = 0; fieldIdx < fieldCount; fieldIdx++)
        {
            // Skipped fields add nothing, so each record has one item per
            // remaining field.
            if (skipFieldIndices?.Contains(fieldIdx) == true) continue;

            record.AppendChild(ItemFor(fieldIdx, columnData[fieldIdx][rowIdx]));
        }
        records.AppendChild(record);
    }

    return records;
}
}