Overture Buildings using DuckDB#
Overture Maps has a building footprints dataset that contains “2.35B conflated building footprints from OSM, Esri Community Maps, Microsoft ML Building Footprints, and Google Open Buildings”. We can subset their dataset efficiently with DuckDB using the following function.
import duckdb
from pathlib import Path
THEME_MAP = {
"locality": "admins",
"locality_area": "admins",
"administrative_boundary": "admins",
"building": "buildings",
"building_part": "buildings",
"place": "places",
"segment": "transportation",
"connector": "transportation",
"infrastructure": "base",
"land": "base",
"land_use": "base",
"water": "base",
}
def overture_buildings(
bbox: tuple[float, float, float, float],
overture_type: str,
dst_parquet: str | Path,
) -> None:
"""Query a subset of Overture's buildings data and save it as a GeoParquet file.
Parameters
----------
bbox : tuple
A tuple of floats representing the bounding box of the area of interest
in the format (xmin, ymin, xmax, ymax) and 4326 coordinate reference system.
overture_type : str
The type of Overture data to query. Valid options are:
- ``locality``
- ``locality_area``
- ``administrative_boundary``
- ``building``
- ``building_part``
- ``place``
- ``segment``
- ``connector``
- ``infrastructure``
- ``land``
- ``land_use``
- ``water``
dst_parquet : str or Path
The path to the output GeoParquet file.
"""
s3_region = "us-west-2"
base_url = f"s3://overturemaps-{s3_region}/release"
version = "2024-04-16-beta.0"
if overture_type not in THEME_MAP:
raise ValueError(f"Valid Overture types are: {list(THEME_MAP)}")
theme = THEME_MAP[overture_type]
remote_path = f"{base_url}/{version}/theme={theme}/type={overture_type}/*"
conn = duckdb.connect()
conn.execute("INSTALL httpfs;")
conn.execute("INSTALL spatial;")
conn.execute("LOAD httpfs;")
conn.execute("LOAD spatial;")
conn.execute(f"SET s3_region='{s3_region}';")
read_parquet = f"read_parquet('{remote_path}', filename=true, hive_partitioning=1);"
conn.execute(f"CREATE OR REPLACE VIEW data_view AS SELECT * FROM {read_parquet}")
query = f"""
SELECT
data.*,
FROM data_view AS data
WHERE data.bbox.xmin <= {bbox[2]} AND data.bbox.xmax >= {bbox[0]}
AND data.bbox.ymin <= {bbox[3]} AND data.bbox.ymax >= {bbox[1]}
"""
file = str(Path(dst_parquet).resolve())
conn.execute(f"COPY ({query}) TO '{file}' WITH (FORMAT PARQUET);")
conn.close()
# Manhattan bbox
bbox_example = (-74.02169, 40.696423, -73.891338, 40.831263)
overture_buildings(bbox_example, "building", "nyc_buildings_subset.parquet")
You can read the GeoParquet file with GeoPandas using the following code snippet:
import geopandas as gpd
import pandas as pd
import shapely.wkb
manhatten = pd.read_parquet("nyc_buildings_subset.parquet")
manhatten = gpd.GeoDataFrame(
manhatten.drop(columns="geometry"),
geometry=shapely.wkb.loads(manhatten["geometry"]),
crs=4326,
)