Extension types for geospatial data for use with 'Arrow'

Overview

geoarrow

R-CMD-check Codecov test coverage

The goal of geoarrow is to prototype Arrow representations of geometry. This is currently a first-draft specification and nothing here should be used for anything except entertainment value.

Installation

You can install the development version from GitHub with:

# install.packages("devtools")
devtools::install_github("paleolimbot/geoarrow")

Write a geoarrow parquet file

This exists for prototyping only, but should work with most things that you throw at it. Notably, sf objects should work out-of-the-box.

library(geoarrow)

nc <- sf::read_sf(system.file("shape/nc.shp", package = "sf"))
write_geoarrow_parquet(nc, "nc.parquet")
arrow::read_parquet("nc.parquet")[c("NAME", "geometry")]
#> # A tibble: 100 × 2
#>    NAME                                                            geometry
#>    <chr>       <list<fixed_size_list<list<fixed_size_list<double, 2>>, 1>>>
#>  1 Ashe                                                                 [1]
#>  2 Alleghany                                                            [1]
#>  3 Surry                                                                [1]
#>  4 Currituck                                                            [3]
#>  5 Northampton                                                          [1]
#>  6 Hertford                                                             [1]
#>  7 Camden                                                               [1]
#>  8 Gates                                                                [1]
#>  9 Warren                                                               [1]
#> 10 Stokes                                                               [1]
#> # … with 90 more rows

You can use any of the schemas to experiment by passing the schema argument:

write_geoarrow_parquet(nc, "nc.parquet", schema = geoarrow_schema_wkt())
arrow::read_parquet("nc.parquet")[c("NAME", "geometry")]
#> # A tibble: 100 × 2
#>    NAME        geometry                                                         
#>    <chr>       <chr>                                                            
#>  1 Ashe        MULTIPOLYGON (((-81.47275543212891 36.23435592651367, -81.540840…
#>  2 Alleghany   MULTIPOLYGON (((-81.23989105224609 36.36536407470703, -81.240692…
#>  3 Surry       MULTIPOLYGON (((-80.45634460449219 36.24255752563477, -80.476387…
#>  4 Currituck   MULTIPOLYGON (((-76.00897216796875 36.31959533691406, -76.017349…
#>  5 Northampton MULTIPOLYGON (((-77.21766662597656 36.24098205566406, -77.234611…
#>  6 Hertford    MULTIPOLYGON (((-76.74506378173828 36.23391723632812, -76.980690…
#>  7 Camden      MULTIPOLYGON (((-76.00897216796875 36.31959533691406, -75.957183…
#>  8 Gates       MULTIPOLYGON (((-76.56250762939453 36.34056854248047, -76.604240…
#>  9 Warren      MULTIPOLYGON (((-78.30876159667969 36.26004028320312, -78.282928…
#> 10 Stokes      MULTIPOLYGON (((-80.02567291259766 36.2502326965332, -80.4530105…
#> # … with 90 more rows

Type examples

First, extension types for the kind of encodings existing libraries support already:

geom_linestring <- wk::wkt("LINESTRING (1 2, 2 3)")

carrow::from_carrow_array(
  geoarrow_create(geom_linestring, schema = geoarrow_schema_wkb()),
  arrow::Array
)
#> Array
#> <fixed_size_binary[41]>
#> [
#>   010200000002000000000000000000F03F000000000000004000000000000000400000000000000840
#> ]

carrow::from_carrow_array(
  geoarrow_create(geom_linestring, schema = geoarrow_schema_wkt()),
  arrow::Array
)
#> Array
#> <string>
#> [
#>   "LINESTRING (1 2, 2 3)"
#> ]

carrow::from_carrow_array(
  geoarrow_create(geom_linestring, schema = geoarrow_schema_geojson()),
  arrow::Array
)
#> Array
#> <string>
#> [
#>   "{"type":"LineString","coordinates":[[1.0,2.0],[2.0,3.0]]}"
#> ]

…but also Arrow-native forms that don’t need to be parsed.

geom_point <- wk::wkt("POINT (1 2)")
geom_linestring <- wk::wkt("LINESTRING (1 2, 2 3)")
geom_poly <- wk::wkt("POLYGON ((0 0, 1 1, 0 1, 0 0))")
geom_multipoint <- wk::wkt("MULTIPOINT (1 2)")
geom_multilinestring <- wk::wkt("MULTILINESTRING ((1 2, 2 3))")
geom_multipoly <- wk::wkt("MULTIPOLYGON (((0 0, 1 1, 0 1, 0 0)))")

carrow::from_carrow_array(geoarrow_create(geom_point), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: double not null>[2]>
#> [
#>   [
#>     1,
#>     2
#>   ]
#> ]
carrow::from_carrow_array(geoarrow_create(geom_linestring), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: fixed_size_list<: double not null>[2] not null>[2]>
#> [
#>   [
#>     [
#>       1,
#>       2
#>     ],
#>     [
#>       2,
#>       3
#>     ]
#>   ]
#> ]
carrow::from_carrow_array(geoarrow_create(geom_poly), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: fixed_size_list<: fixed_size_list<: double not null>[2] not null>[4] not null>[1]>
#> [
#>   [
#>     [
#>       [
#>         0,
#>         0
#>       ],
#>       [
#>         1,
#>         1
#>       ],
#>       [
#>         0,
#>         1
#>       ],
#>       [
#>         0,
#>         0
#>       ]
#>     ]
#>   ]
#> ]
carrow::from_carrow_array(geoarrow_create(geom_multipoint), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: fixed_size_list<: double not null>[2] not null>[1]>
#> [
#>   [
#>     [
#>       1,
#>       2
#>     ]
#>   ]
#> ]
carrow::from_carrow_array(geoarrow_create(geom_multilinestring), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: fixed_size_list<: fixed_size_list<: double not null>[2] not null>[2]>[1]>
#> [
#>   [
#>     [
#>       [
#>         1,
#>         2
#>       ],
#>       [
#>         2,
#>         3
#>       ]
#>     ]
#>   ]
#> ]
carrow::from_carrow_array(geoarrow_create(geom_multipoly), arrow::Array)
#> FixedSizeListArray
#> <fixed_size_list<: fixed_size_list<: fixed_size_list<: fixed_size_list<: double not null>[2] not null>[4] not null>[1]>[1]>
#> [
#>   [
#>     [
#>       [
#>         [
#>           0,
#>           0
#>         ],
#>         [
#>           1,
#>           1
#>         ],
#>         [
#>           0,
#>           1
#>         ],
#>         [
#>           0,
#>           0
#>         ]
#>       ]
#>     ]
#>   ]
#> ]

Collections currently fall back on WKB but could theoretically be supported using a union type:

geom_collection <- wk::wkt("GEOMETRYCOLLECTION (POINT (0 1), LINESTRING (1 1, 2 2))")
carrow::from_carrow_array(geoarrow_create(geom_collection), arrow::Array)
#> Array
#> <fixed_size_binary[71]>
#> [
#>   01070000000200000001010000000000000000000000000000000000F03F010200000002000000000000000000F03F000000000000F03F00000000000000400000000000000040
#> ]

The above examples all have a single feature/ring/linestring, so they fall back to using a fixed-width list. The ability to use a fixed-width list might be important if you have, say, 10 million rectangles that all have 5 points where there’s no need to store a 64-bit offset to each coordinate. For cases where there are 2 ^ 31 or more coordinates, the offsets will need to be 64-bit integers (i.e., large_list type). You can force either of these cases for any level of nesting by specifying a schema by hand:

carrow::from_carrow_array(
  geoarrow_create(
    geom_poly,
    schema = geoarrow_schema_polygon(format = c("+L", "+l")),
    strict = TRUE
  ), 
  arrow::Array
)
#> LargeListArray
#> <large_list<: list<: fixed_size_list<: double not null>[2] not null> not null>>
#> [
#>   [
#>     [
#>       [
#>         0,
#>         0
#>       ],
#>       [
#>         1,
#>         1
#>       ],
#>       [
#>         0,
#>         1
#>       ],
#>       [
#>         0,
#>         0
#>       ]
#>     ]
#>   ]
#> ]

There are a few options for point storage. The default is to store coordinates as a fixed-width list of doubles (float64) so that coordinates stay together in memory. This is probably fastest and mirrors the way that some other formats (e.g., WKB) store coordinates.

(points <- wk::xy(1:3, 4:6))
#> <wk_xy[3]>
#> [1] (1 4) (2 5) (3 6)
array <- geoarrow_create(points)
as.numeric(array$array_data$children[[1]]$buffers[[2]])
#> [1] 1 4 2 5 3 6

Alternatively, you can store coordinates as a struct. This is much like storing x and y values in their own columns and might be useful if this is already how a user has these values in memory (e.g., the user read in a table where x and y values were their own columns). It also might be faster for operations like adding and dropping dimensions because the buffers stay separate for each dimension.

array <- geoarrow_create(points, schema = geoarrow_schema_point_struct())
as.numeric(array$array_data$children[[1]]$buffers[[2]])
#> [1] 1 2 3
as.numeric(array$array_data$children[[2]]$buffers[[2]])
#> [1] 4 5 6

For both of these, it might be adequate to store values as float32 instead of float64. I didn’t implement that here yet but in theory it shouldn’t be a problem.

These are far from an exhaustive list of how coordinates can be stored. For example, S2 cell IDs (64-bit integers with ~1 cm resolution describing a location on the globe) are a fast and compact way to encode geographic coordinates. The H3 library has a similar scheme where each 64-bit integer describes a hexagon on the sphere. Some libraries use 32-bit integers scaled by 1e6 to represent longitude and latitude, and some provide an exact decimal type to skirt around floating point precision issues, particularly for internal calculations (e.g., decimal128 or decimal256).

It might not be worth supporting all the point storage formats in any particular implementation; however, the format is such that the coordinate array can be swapped out independent of the rest of the array structure.

Why so many options?

The options are provided here so that implementations can experiment to see if more than one is worth supporting.

Metadata

The schemas here use column-level extension types and extension metadata to encode dimension names, CRS information, and a flag to specify that edges should be considered geodesic rather than Cartesian.

schema <- geoarrow_schema_linestring(
  geodesic = TRUE,
  point = geoarrow_schema_point(crs = "OGC:CRS84")
)
schema$metadata[["ARROW:extension:name"]]
#> [1] "geoarrow::linestring"
geoarrow_metadata(schema)
#> $geodesic
#> [1] "true"
geoarrow_metadata(schema$children[[1]])
#> $crs
#> [1] "OGC:CRS84"
#> 
#> $dim
#> [1] "xy"

I would argue that any string recognized by the latest PROJ release as a CRS is valid for the crs item (which lives with the point array). This includes full WKT2 output, which provides more detail at the expense of including redundant information that can get out of sync. I think “OGC:CRS84” or “EPSG:32620” are no less exact but are perhaps imprecise in a way I don’t yet understand.

Comments
  • point-default.parquet is not readable with pyarrow / arrow C++

    point-default.parquet is not readable with pyarrow / arrow C++

    >>> import pyarrow.parquet as pq
    >>> pq.read_table('inst/example_parquet/point-default.parquet')
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/home/even/arrow/cpp/build/myvenv/lib/python3.8/site-packages/pyarrow/parquet.py", line 1996, in read_table
        return dataset.read(columns=columns, use_threads=use_threads,
      File "/home/even/arrow/cpp/build/myvenv/lib/python3.8/site-packages/pyarrow/parquet.py", line 1831, in read
        table = self._dataset.to_table(
      File "pyarrow/_dataset.pyx", line 323, in pyarrow._dataset.Dataset.to_table
      File "pyarrow/_dataset.pyx", line 2311, in pyarrow._dataset.Scanner.to_table
      File "pyarrow/error.pxi", line 143, in pyarrow.lib.pyarrow_internal_check_status
      File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
    pyarrow.lib.ArrowInvalid: Expected all lists to be of size=2 but index 3 had size=0
    

    On an OGR Parquet driver I'm developing, I can also reproduce the same issue with NULL Point. It seems that the Arrow C++ library doesn't correctly handle writing (or reading? I'm not sure which side is broken) a NULL entry for a FixedSizeList in the Parquet format (this works correctly for Feather). The workaround I found is to write a POINT EMPTY instead of a NULL entry.

    opened by rouault 5
  • WKB with non-2D dimensions doesn't follow ISO encoding

    WKB with non-2D dimensions doesn't follow ISO encoding

    I noticed that WKB geometry with xyz, xym or xyzm coordinate dimension use the 30th and 31st most-significant bits of the int32 flag at offset 1 of the WKB instead of the 2D_code+1000, 2D_code+2000, 2D_code+3000 used by ISO WKB.

    opened by rouault 3
  • Implement C++-native builders for more arrow-native geometries

    Implement C++-native builders for more arrow-native geometries

    #2 implemented WKB and WKT in native C++...this PR does the same for point, linestring, polygon, and collection types.

    • [x] point
    • [x] linestring
    • [x] polygon
    • [x] multipoint
    • [x] multilinestring
    • [x] multipolygon
    opened by paleolimbot 1
  • Implement `arrow::ExtensionType`

    Implement `arrow::ExtensionType`

    Only barely reproducible, since these are both still PRs, but done to test the motivating example behind the arrow extension type PR.

    # remotes::install_github("apache/arrow#12467")
    # remotes::install_github("paleolimbot/[email protected]")
    library(arrow, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    library(geoarrow)
    
    places_folder <- system.file("example_dataset/osm_places", package = "geoarrow")
    places <- open_dataset(places_folder)
    places$schema$geometry$type
    #> GeoArrowType
    #> point GEOGCS["WGS 84",DATUM["WGS_...
    places$schema$geometry$type$crs
    #> [1] "GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AXIS[\"Longitude\",EAST],AXIS[\"Latitude\",NORTH]]"
    
    # works!
    Scanner$create(places)$ToTable()
    #> Table
    #> 7255 rows x 6 columns
    #> $osm_id <string>
    #> $code <int32>
    #> $population <double>
    #> $name <string>
    #> $geometry <point GEOGCS["WGS 84",DATUM["WGS_...>
    #> $fclass <string>
    #> 
    #> See $metadata for additional Schema metadata
    
    # works!
    as.data.frame(Scanner$create(places)$ToTable())
    #> # A tibble: 7,255 × 6
    #>    osm_id      code population name           geometry                    fclass
    #>    <chr>      <int>      <dbl> <chr>          <wk_wkb>                    <chr> 
    #>  1 21040334    1001      50781 Roskilde       <POINT (12.08192 55.64335)> city  
    #>  2 21040360    1001      72398 Esbjerg        <POINT (8.452075 55.46649)> city  
    #>  3 26559154    1001      62687 Randers        <POINT (10.03715 56.46175)> city  
    #>  4 26559170    1001      60508 Kolding        <POINT (9.47905 55.4895)>   city  
    #>  5 26559198    1001      56567 Vejle          <POINT (9.533324 55.70001)> city  
    #>  6 26559213    1001     273077 Aarhus         <POINT (10.2134 56.14963)>  city  
    #>  7 26559274    1001     178210 Odense         <POINT (10.38521 55.39972)> city  
    #>  8 1368129781  1001      58646 Horsens        <POINT (9.844477 55.86117)> city  
    #>  9 2247730880  1001     114194 Aalborg        <POINT (9.921526 57.04626)> city  
    #> 10 393558713   1030          0 Englebjerggård <POINT (11.77737 55.2004)>  farm  
    #> # … with 7,245 more rows
    
    # unfortunately, this fails...
    places %>% 
      filter(population > 100000) %>% 
      select(name, population, fclass, geometry) %>% 
      arrange(desc(population)) %>% 
      collect()
    #> Error in `handle_csv_read_error()` at r/R/dplyr-collect.R:33:6:
    #> ! NotImplemented: concatenation of extension<geoarrow.point>
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:195  VisitTypeInline(*out_->type, this)
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:590  ConcatenateImpl(data, pool).Concatenate(&out_data)
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2025  Concatenate(values.chunks(), ctx->memory_pool())
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2084  TakeCA(*table.column(j), indices, options, ctx)
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/sink_node.cc:375  impl_->DoFinish()
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/exec_plan.cc:484  iterator_.Next()
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:337  ReadNext(&batch)
    #> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:351  ToRecordBatches()
    
    # ...unless we unregister the extension type and use geoarrow_collect()
    arrow::unregister_extension_type("geoarrow.point")
    open_dataset(places_folder) %>% 
      filter(population > 100000) %>% 
      select(name, population, fclass, geometry) %>% 
      arrange(desc(population)) %>% 
      geoarrow_collect()
    #> # A tibble: 5 × 4
    #>   name          population fclass           geometry                   
    #>   <chr>              <dbl> <chr>            <wk_wkb>                   
    #> 1 København         613288 national_capital <POINT (12.57007 55.68672)>
    #> 2 Aarhus            273077 city             <POINT (10.2134 56.14963)> 
    #> 3 Odense            178210 city             <POINT (10.38521 55.39972)>
    #> 4 Aalborg           114194 city             <POINT (9.921526 57.04626)>
    #> 5 Frederiksberg     102029 suburb           <POINT (12.53262 55.67802)>
    

    Created on 2022-03-29 by the reprex package (v2.0.1)

    opened by paleolimbot 1
  • Better sf conversion for Arrow table-like things

    Better sf conversion for Arrow table-like things

    library(geoarrow)
    library(sf)
    #> Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1
    
    vctr <- geoarrow(wk::wkt("POINT (0 1)", crs = "OGC:CRS84"))
    table <- arrow::arrow_table(geom = vctr)
    batch <- arrow::record_batch(geom = vctr)
    dataset <- arrow::InMemoryDataset$create(table)
    query <- dplyr::filter(table, arrow::Expression$scalar(TRUE))
    
    st_crs(query)
    #> Coordinate Reference System:
    #>   User input: OGC:CRS84 
    #>   wkt:
    #> GEOGCRS["WGS 84",
    #>     DATUM["World Geodetic System 1984",
    #>         ELLIPSOID["WGS 84",6378137,298.257223563,
    #>             LENGTHUNIT["metre",1]],
    #>         ID["EPSG",6326]],
    #>     PRIMEM["Greenwich",0,
    #>         ANGLEUNIT["degree",0.0174532925199433],
    #>         ID["EPSG",8901]],
    #>     CS[ellipsoidal,2],
    #>         AXIS["longitude",east,
    #>             ORDER[1],
    #>             ANGLEUNIT["degree",0.0174532925199433,
    #>                 ID["EPSG",9122]]],
    #>         AXIS["latitude",north,
    #>             ORDER[2],
    #>             ANGLEUNIT["degree",0.0174532925199433,
    #>                 ID["EPSG",9122]]]]
    st_bbox(query)
    #> xmin ymin xmax ymax 
    #>    0    1    0    1
    st_geometry(query)
    #> Geometry set for 1 feature 
    #> Geometry type: POINT
    #> Dimension:     XY
    #> Bounding box:  xmin: 0 ymin: 1 xmax: 0 ymax: 1
    #> Geodetic CRS:  WGS 84
    #> POINT (0 1)
    st_as_sf(query)
    #> Simple feature collection with 1 feature and 0 fields
    #> Geometry type: POINT
    #> Dimension:     XY
    #> Bounding box:  xmin: 0 ymin: 1 xmax: 0 ymax: 1
    #> Geodetic CRS:  WGS 84
    #>          geom
    #> 1 POINT (0 1)
    

    Created on 2022-05-20 by the reprex package (v2.0.1)

    opened by paleolimbot 0
  • make sure collect for data.frame propagates CRS

    make sure collect for data.frame propagates CRS

    Before this PR:

    geoarrow::geoarrow_collect(
          data.frame(a = 1, b = wk::wkt("POINT (0 1)", crs = "EPSG:32620")),
          handler = wk::xy_writer()
        ) |> wk::wk_crs()
    #> NULL
    

    After this PR:

    geoarrow::geoarrow_collect(
          data.frame(a = 1, b = wk::wkt("POINT (0 1)", crs = "EPSG:32620")),
          handler = wk::xy_writer()
        ) |> wk::wk_crs()
    #> [1] "EPSG:32620"
    

    Created on 2022-05-19 by the reprex package (v2.0.1)

    opened by paleolimbot 0
  • Better passing in information in default schemas

    Better passing in information in default schemas

    ...the CRS and dimensions were getting dropped for polygons (fixes #16).

    schema <- geoarrow::geoarrow_schema_default(wk::wkt("POLYGON ((0 0, 0 1, 1 0, 0 0))", crs = "EPSG:1234"))
    geoarrow::geoarrow_metadata(schema$children[[1]]$children[[1]])
    #> named list()
    

    After this PR:

    schema <- geoarrow::geoarrow_schema_default(wk::wkt("POLYGON ((0 0, 0 1, 1 0, 0 0))", crs = "EPSG:1234"))
    geoarrow::geoarrow_metadata(schema$children[[1]]$children[[1]])
    #> $crs
    #> [1] "EPSG:1234"
    
    opened by paleolimbot 0
  • `geoarrow()` drops CRS sometimes?

    `geoarrow()` drops CRS sometimes?

    # convert geometry to geoarrow encoding
      geom <- as_geoarrow(
        table_sorted$geometry,
        schema_override = geoarrow_schema_wkb()
      )
      # TODO: this shouldn't drop CRS but it does
      geom <- geoarrow(geom)
    
    opened by paleolimbot 0
  • Align geoarrow writing with GDAL writing

    Align geoarrow writing with GDAL writing

    • [x] Order of fields in geoparquet metadata (to make diffs better)
    • [x] Use 0.0 proper double output rather than integer output for bbox field
    • [x] Don't write Z or M in bbox for now
    opened by paleolimbot 0
  • Regenerate example files using new wkb writer

    Regenerate example files using new wkb writer

    As noted in #5! With the new WKB writer, we now always write ISO WKB whenever converting to geoarrow.wkb.

    library(geoarrow)
    
    tbl <- read_geoarrow_parquet(
      system.file("example_parquet/point_z-wkb.parquet", package = "geoarrow"),
      as_data_frame = FALSE
    )
    
    tbl2 <- read_geoarrow_parquet(
      system.file("example_parquet/point_z-wkb.parquet", package = "geoarrow"),
      handler = geos::geos_geometry_writer()
    )
    
    unclass(as.vector(tbl$geometry))[[1]]
    #>  [1] 01 e9 03 00 00 00 00 00 00 00 00 3e 40 00 00 00 00 00 00 24 40 00 00 00 00
    #> [26] 00 00 44 40
    geos::geos_write_wkb(tbl2$geometry, flavor = "iso")[[1]]
    #>  [1] 01 e9 03 00 00 00 00 00 00 00 00 3e 40 00 00 00 00 00 00 24 40 00 00 00 00
    #> [26] 00 00 44 40
    

    Created on 2022-03-24 by the reprex package (v2.0.1)

    opened by paleolimbot 0
  • GDAL RFC 86 Column-oriented read API for vector layers

    GDAL RFC 86 Column-oriented read API for vector layers

    https://lists.osgeo.org/pipermail/gdal-dev/2022-June/055884.html introduces https://github.com/OSGeo/gdal/pull/5830, https://github.com/rouault/gdal/blob/rfc_86/doc/source/development/rfc/rfc86_column_oriented_api.rst. Looks potentially promising! @edzer - is this helpful in sdsr?

    opened by rsbivand 2
  • Notebook Viewer in RStudio errors when viewing a geoarrow_vctr

    Notebook Viewer in RStudio errors when viewing a geoarrow_vctr

    library(dplyr)
    #> 
    #> Attaching package: 'dplyr'
    #> The following objects are masked from 'package:stats':
    #> 
    #>     filter, lag
    #> The following objects are masked from 'package:base':
    #> 
    #>     intersect, setdiff, setequal, union
    library(arrow, warn.conflicts = FALSE)
    library(geoarrow)
    
    bucket <- s3_bucket("voltrondata-public-datasets")
    ds <- open_dataset(bucket$path("phl-parking"))
    ds %>% 
      head() %>% 
      collect()
    #> # A tibble: 6 × 13
    #>   anon_ticket_number issue_datetime      state anon_plate_id division location  
    #>                <int> <dttm>              <chr>         <int>    <int> <chr>     
    #> 1              39985 2011-12-31 21:17:00 PA          1606959       NA 832 N 40T…
    #> 2              41812 2011-12-31 21:54:00 PA           503820       NA 7200 N 19…
    #> 3              41814 2011-12-31 21:45:00 PA          1102245       NA 7900 PROV…
    #> 4              46288 2011-12-31 20:09:00 NJ           427139       NA 450 N 6TH…
    #> 5              46289 2011-12-31 20:10:00 NJ           308463       NA 448 N 6TH…
    #> 6              46290 2011-12-31 20:12:00 PA          1585402       NA 446 N 6TH…
    #> # … with 7 more variables: violation_desc <chr>, fine <dbl>,
    #> #   issuing_agency <chr>, gps <lgl>, zip_code <int>, geometry <grrw_pnt>,
    #> #   year <int>
    

    (Except in an RStudio Notebook I get:

    Error in wk_handle.geoarrow_vctr(handleable, wkt_format_handler(precision = precision,  : 
      `` is an external pointer to NULL
    
    opened by paleolimbot 0
  • `st_collect()`, `st_as_sf()`, and default conversion from Arrow to R

    `st_collect()`, `st_as_sf()`, and default conversion from Arrow to R

    Right now, geoarrow doesn't convert to sf by default and instead maintains a zero-copy shell around the ChunkedArray from whence it came. This is instantaneous and is kind of like ALTREP for geometry, since we can't do ALTREP on lists like Arrow does for character, integer, and factor. This is up to 10x faster and prevents a full copy of the geometry column. I also rather like that it maintains neutrality between terra, sf, vapour, wk, or others that may come along in the future...who are we to guess where the user wants to put the geometry column next? The destination could be Arrow itself (e.g., via group_by() %>% write_dataset()), or the column could get dropped, filtered, or rearranged before calling an sf method.

    However, 99% of the time a user just wants an sf object. After #20 we can use sf::st_as_sf() on an arrow_dplyr_query to collect() it into an sf object, and @boshek suggested st_collect(), which is a way better name and is more explicit than st_as_sf(). There's also st_geometry(), st_crs(), st_bbox(), and st_as_crs() methods for the geoarrow_vctr column; however, we still get an awkward error if we collect() and then try to convert to sf:

    vctr <- geoarrow::geoarrow(wk::wkt("POINT (0 1)", crs = "OGC:CRS84"))
    df <- data.frame(geometry = vctr)
    sf::st_as_sf(df)
    #> Error in st_sf(x, ..., agr = agr, sf_column_name = sf_column_name): no simple features geometry column present
    

    That might be solvable in sf, although I'd like to give the current implementation a chance to get tested to collect feedback on whether this is or is not a problem for anybody before committing to the current zero-copy-shell-by-default.

    opened by paleolimbot 8
  • Handle multiple dimensions among features/respect strict = TRUE

    Handle multiple dimensions among features/respect strict = TRUE

    A few options:

    • Error (what happens now)
    • Fill extra dimensions with NaN if strict is TRUE and there are extra dimensions
    • Drop dimensions if strict is TRUE and the dimension isn't supposed to be there

    Perhaps all of those (make a user opt-in to extra dimensions filled with NaN)? Either way, strict = TRUE might not be respected or might give a different error because the schemas aren't compatible (clearly this isn't tested).

    opened by paleolimbot 0
  • Release geoarrow 0.1.0

    Release geoarrow 0.1.0

    (this is still a few months off, but is a hook to keep track of/discuss progress related to the initial release)

    First release:

    Prepare for release:

    • [ ] devtools::build_readme()
    • [ ] urlchecker::url_check()
    • [ ] devtools::check(remote = TRUE, manual = TRUE)
    • [ ] devtools::check_win_devel()
    • [ ] rhub::check_for_cran()
    • [ ] rhub::check(platform = 'ubuntu-rchk')
    • [ ] rhub::check_with_sanitizers()
    • [ ] Review pkgdown reference index for, e.g., missing topics
    • [ ] Draft blog post

    Submit to CRAN:

    • [ ] usethis::use_version('minor')
    • [ ] devtools::submit_cran()
    • [ ] Approve email

    Wait for CRAN...

    • [ ] Accepted :tada:
    • [ ] usethis::use_github_release()
    • [ ] usethis::use_dev_version()
    • [ ] usethis::use_news_md()
    • [ ] Finish blog post
    • [ ] Tweet
    • [ ] Add link to blog post in pkgdown news menu
    opened by paleolimbot 0
Owner
Dewey Dunnington
R developer at @voltrondata, former @rstudio summer intern, ggplot2
Dewey Dunnington
Arbitrary Precision provides C++ long integer types that behave as basic integer types. This library aims to be intuitive and versatile in usage, rather than fast.

Arbitrary Precision (AP) Cross-platform and cross-standard header-only arbitrary precision arithmetic library. Currently it offers integer types that

null 17 Sep 28, 2022
rax/RAX is a C++ extension library designed to provide new, fast, and reliable cross-platform class types.

rax rax/RAX is a C++ extension library designed to provide cross-platform new, fast, and reliable class types for different fields such as work with I

MaxHwoy 5 May 2, 2022
An R interface to the 'Apache Arrow' C API

carrow The goal of carrow is to wrap the Arrow Data C API and Arrow Stream C API to provide lightweight Arrow support for R packages to consume and pr

Dewey Dunnington 30 Aug 5, 2022
Proof of Concept 'GeoPackage' to Arrow Converter

gpkg The goal of gpkg is to provide a proof-of-concept reader for SQLite queries into Arrow C Data interface structures. Installation You can install

Dewey Dunnington 8 May 20, 2022
"SaferCPlusPlus" is essentially a collection of safe data types intended to facilitate memory and data race safe C++ programming

A collection of safe data types that are compatible with, and can substitute for, common unsafe native c++ types.

null 328 Nov 21, 2022
A refactored Proof-of-concept originally developed in 2017 to print all function calls with their arguments data types and values using Ptrace during program execution.

print-function-args-debugger A refactored Proof-of-concept originally developed in 2017 to print all function calls with their arguments data types an

*finixbit 15 Jun 17, 2022
Samir Teymurov 1 Oct 6, 2021
This is a tool for software engineers to view,record and analyse data(sensor data and module data) In the process of software development.

![Contributors][Huang Jianyu] Statement 由于工具源码在网上公开,除使用部分开源项目代码外,其余代码均来自我个人,工具本身不包含公司的知识产权,所有与公司有关的内容均从软件包中移除,软件发布遵循Apache协议,任何人均可下载进行修改使用,如使用过程中出现任何问

HuangJianyu 34 May 5, 2022
USENIX 2021 - Nyx: Greybox Hypervisor Fuzzing using Fast Snapshots and Affine Types

Nyx: Greybox Hypervisor Fuzzing using Fast Snapshots and Affine Types Nyx is fast full-VM snapshot fuzzer for type-2 hypervisors. It's built upon kAFL

Chair for Sys­tems Se­cu­ri­ty 159 Nov 9, 2022
Recursive Variant: A simple library for Recursive Variant Types

rva::variant — Recursive Sum Types for C++ Provided by the Recursive Variant Authority. We stand united in opposition to the TVA. May your variants ne

Alecto Irene Perez 66 Nov 24, 2022
Strong type - C++ implementation of strong types

strong_type C++ implementation of strong types Build Status Linux (gcc-8, clang-8) / OSX Table of contents Table of contents What is this ? A tour of

Clément 46 Sep 27, 2022
Tiny - low-level library for minimizing the size of your types

foonathan/tiny Note: This project is currently WIP, no guarantees are made until an 0.1 release. This project is a C++11 library for putting every las

Jonathan Müller 101 Oct 29, 2022
General purpose power controller, capable of driving soldering irons using different voltages and probe types.

All-purpose Power Micro Controller This general purpose power micro controller features: Wheatstone Bridge front-end New Texas Instruments INA823 inst

Tomasz Jastrzębski 29 Oct 9, 2022
std::tuple like methods for user defined types without any macro or boilerplate code

Boost.PFR This is a C++14 library for very basic reflection that gives you access to structure elements by index and provides other std::tuple like me

Boost.org 1.1k Nov 25, 2022
By putting in a lot of speed, the speed sequence is sorted and divided, three types of speed interval distribution maps are generated.(including broken line graph,histogram and curve graph)

Auto-drawing-speed-range-map By putting in a lot of speed, the speed sequence is sorted and divided, three types of speed interval distribution maps a

wellwellAllwen 4 May 14, 2022
Node1D and other 1-dimensional node types for making 1D games in Godot.

Godot 1D Node1D and other 1-dimensional node types for making 1D games in Godot. Have you ever wanted to make 1D games in Godot? ...no? You say you ha

Aaron Franke 12 Jul 31, 2022
Tools for working with Wwise file types (only extraction at the moment)

Wwise Audio Tools This repository is for a static and dynamic library as well as a simple command line tool used to convert Wwise WEM files to OGG fil

RED Modding tools 7 Oct 10, 2022
Quick Look extension for Markdown files on macOS Catalina and Big Sur.

QLMarkdown is a macOS Quick Look extension to preview Markdown files. It can also preview textbundle packages and rmarkdown (.rmd) files.

sbarex 618 Nov 21, 2022
Haxe native extension to read and write windows clipboard.

Haxe Clipboard This is a native library to read and write clipboard data from Haxe. It uses Ammer to generate bindings. Note: This is a Windows only l

Ludovic Bas 12 Nov 11, 2022