Overture Buildings using PyArrow

Overture Buildings using PyArrow

Overture Maps has a building footprints dataset that contains “2.35B conflated building footprints from OSM, Esri Community Maps, Microsoft ML Building Footprints, and Google Open Buildings”. We can subset their dataset efficiently using pyarrow:

from pathlib import Path

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from pyarrow import fs


def get_buildings(
    bbox: tuple[float, float, float, float], path_parquet: str | Path
) -> gpd.GeoDataFrame:
    """Retrieve building data from Overture Maps for a given bounding box.

    Notes
    -----
    This function is based on
    `overturemaps-py <https://github.com/OvertureMaps/overturemaps-py>`__.

    Parameters
    ----------
    bbox : tuple
        Bounding box coordinates (xmin, ymin, xmax, ymax)
    path_parquet : str or Path
        Path to save the output file

    Returns
    -------
    geopandas.GeoDataFrame
        GeoDataFrame containing the building data
    """
    path_parquet = Path(path_parquet)
    if path_parquet.suffix != ".parquet":
        msg = "The output file must be a GeoParquet file with `.parquet` extension."
        raise ValueError(msg)

    if not isinstance(bbox, tuple | list) or len(bbox) != 4:
        msg = "The bounding box must be a tuple of four elements."
        raise ValueError

    s3_region = "us-west-2"
    version = "2024-10-23.0"
    src = f"overturemaps-{s3_region}/release/{version}/theme=buildings/type=building/"
    xmin, ymin, xmax, ymax = bbox
    filter = (
        (pc.field("bbox", "xmin") < xmax)
        & (pc.field("bbox", "xmax") > xmin)
        & (pc.field("bbox", "ymin") < ymax)
        & (pc.field("bbox", "ymax") > ymin)
    )

    dataset = ds.dataset(
        src, filesystem=fs.S3FileSystem(anonymous=True, region=s3_region)
    )
    batches = dataset.to_batches(filter=filter)
    non_empty_batches = (b for b in batches if b.num_rows > 0)

    geoarrow_schema = dataset.schema.set(
        dataset.schema.get_field_index("geometry"),
        dataset.schema.field("geometry").with_metadata(
            {b"ARROW:extension:name": b"geoarrow.wkb"}
        ),
    )
    reader = pa.RecordBatchReader.from_batches(geoarrow_schema, non_empty_batches)

    with pq.ParquetWriter(path_parquet, reader.schema) as writer:
        for batch in reader:
            if batch.num_rows > 0:
                writer.write_batch(batch)

    return gpd.read_parquet(path_parquet)

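Note that the release is hard-coded in the `version` variable inside the function. To use the latest data, look up the most recent release identifier in Overture’s release notes and update `version` accordingly.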