mirror of https://github.com/delta-io/delta-rs
Improve dev experience with Python binding (#263)
* Add get_file_paths_by_partitions in Python bindings * Add makefile, mypy, isort
This commit is contained in:
parent
936caaad27
commit
0c3853ea13
|
@ -19,20 +19,18 @@ jobs:
|
|||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.6
|
||||
- name: Format Python code with Black
|
||||
- name: Check Python
|
||||
run: |
|
||||
pip install black
|
||||
black . --check
|
||||
pip install black isort mypy
|
||||
make check-python
|
||||
- name: Install minimal stable with clippy and rustfmt
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: default
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Check
|
||||
run: cargo clippy
|
||||
- name: Format
|
||||
run: cargo fmt -- --check
|
||||
- name: Check Rust
|
||||
run: make check-rust
|
||||
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -57,24 +55,20 @@ jobs:
|
|||
- name: Enable manylinux Python targets
|
||||
run: echo "/opt/python/cp36-cp36m/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install maturin
|
||||
run: pip install maturin==0.10.4
|
||||
|
||||
- name: Build and install deltalake
|
||||
run: |
|
||||
make setup
|
||||
# disable manylinux audit checks for test builds
|
||||
maturin build --manylinux off
|
||||
ls -lh ../target/wheels
|
||||
pip install $(printf ../target/wheels/deltalake-*-cp36-abi3-*.whl)'[devel,pandas]'
|
||||
|
||||
- name: Run tests
|
||||
run: |
|
||||
py.test --cov tests -m 'not integration'
|
||||
run: make unit-test
|
||||
|
||||
# - name: Run Integration tests
|
||||
# run: |
|
||||
# py.test --cov tests -m integration
|
||||
|
||||
- name: Build Sphinx documentation
|
||||
run: |
|
||||
sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html
|
||||
run: make build-documentation
|
||||
|
|
|
@ -52,14 +52,13 @@ jobs:
|
|||
with:
|
||||
python-version: 3.6
|
||||
|
||||
- name: Install matruin
|
||||
run: |
|
||||
pip install maturin==0.10.4
|
||||
- name: Setup
|
||||
run: make setup
|
||||
|
||||
- name: Publish to pypi (without sdist)
|
||||
env:
|
||||
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
||||
run: maturin publish -b pyo3 --target ${{ matrix.target }} --no-sdist
|
||||
run: make publish TARGET="${{ matrix.target }}" EXTRA_ARGS="--no-sdist"
|
||||
|
||||
release-github-pypi-manylinux:
|
||||
needs: validate-release-tag
|
||||
|
@ -86,15 +85,14 @@ jobs:
|
|||
run: |
|
||||
echo "/opt/python/cp36-cp36m/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Install matruin
|
||||
run: |
|
||||
pip install maturin==0.10.4
|
||||
- name: Setup
|
||||
run: make setup
|
||||
|
||||
- name: Publish manylinux to pypi (without sdist)
|
||||
env:
|
||||
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
||||
# linux build uploads sdist to update project description on PyPI
|
||||
run: maturin publish -b pyo3 --target x86_64-unknown-linux-gnu
|
||||
run: make publish TARGET="x86_64-unknown-linux-gnu" EXTRA_ARGS=""
|
||||
|
||||
release-docs:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -111,13 +109,13 @@ jobs:
|
|||
|
||||
- name: Build and install deltalake
|
||||
run: |
|
||||
pip install maturin==0.10.4
|
||||
make setup
|
||||
maturin build --manylinux off
|
||||
pip install $(printf ../target/wheels/deltalake-*-cp36-abi3-*.whl)'[devel,pandas]'
|
||||
|
||||
- name: Build Sphinx documentation
|
||||
run: |
|
||||
sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html
|
||||
make build-documentation
|
||||
mv docs/build ~/build
|
||||
|
||||
echo "Configuring git..."
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
# venv
|
||||
venv
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
||||
# Unit test / coverage reports
|
||||
.coverage
|
||||
.pytest_cache/
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
|
||||
# sphinx build directory
|
||||
docs/build
|
||||
|
||||
|
|
|
@ -43,6 +43,8 @@ requires-dist = [
|
|||
"pyarrow>=4",
|
||||
'numpy<1.20.0;python_version<="3.6"',
|
||||
"pandas; extra =='pandas'",
|
||||
"mypy; extra == 'devel'",
|
||||
"isort; extra == 'devel'",
|
||||
"pytest; extra == 'devel'",
|
||||
"pytest-mock; extra == 'devel'",
|
||||
"pytest-cov; extra == 'devel'",
|
||||
|
@ -51,4 +53,4 @@ requires-dist = [
|
|||
"sphinx-rtd-theme; extra == 'devel'",
|
||||
"toml; extra == 'devel'",
|
||||
]
|
||||
provides-extra = ["pandas", "devel"]
|
||||
provides-extra = ["pandas", "devel"]
|
|
@ -0,0 +1,69 @@
|
|||
.DEFAULT_GOAL := help
|
||||
|
||||
VENV := venv
|
||||
|
||||
|
||||
# setup-venv is a command name, not a file to build, so declare it phony.
.PHONY: setup-venv
|
||||
setup-venv: ## Setup the virtualenv
|
||||
$(info --- Setup virtualenv ---)
|
||||
python -m venv $(VENV)
|
||||
|
||||
.PHONY: setup
|
||||
setup: ## Setup the requirements
|
||||
$(info --- Setup dependencies ---)
|
||||
pip install maturin==0.10.6
|
||||
|
||||
.PHONY: develop
|
||||
develop: setup ## Install Python binding of delta-rs
|
||||
$(info --- Maturin develop ---)
|
||||
maturin develop
|
||||
|
||||
.PHONY: format
|
||||
format: ## Format the code
|
||||
$(info --- Rust format ---)
|
||||
cargo fmt
|
||||
$(info --- Python format ---)
|
||||
black .
|
||||
isort .
|
||||
|
||||
.PHONY: check-rust
|
||||
check-rust: ## Run check on Rust
|
||||
$(info --- Check Rust clippy ---)
|
||||
cargo clippy
|
||||
$(info --- Check Rust format ---)
|
||||
cargo fmt -- --check
|
||||
|
||||
.PHONY: check-python
|
||||
check-python: ## Run check on Python
|
||||
$(info Check Python isort)
|
||||
isort --check-only .
|
||||
$(info Check Python black)
|
||||
black --check .
|
||||
$(info Check Python mypy)
|
||||
mypy --exclude "$(VENV)|docs|tests" .
|
||||
|
||||
.PHONY: unit-test
|
||||
unit-test: ## Run unit test
|
||||
$(info --- Run Python unit-test ---)
|
||||
py.test --cov tests -m 'not integration'
|
||||
|
||||
.PHONY: build-documentation
|
||||
build-documentation: ## Build documentation with Sphinx
|
||||
$(info --- Run build of the Sphinx documentation ---)
|
||||
sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html
|
||||
|
||||
.PHONY: publish
|
||||
publish: ## Publish Python binding version in Pypi
|
||||
$(info --- Run publish of the Python binding ---)
|
||||
maturin publish -b pyo3 --target $(TARGET) $(EXTRA_ARGS)
|
||||
|
||||
.PHONY: clean
|
||||
clean: ## Run clean
|
||||
$(warning --- Clean virtualenv and target directory ---)
|
||||
cargo clean
|
||||
rm -rf $(VENV)
|
||||
find . -type f -name '*.pyc' -delete
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
|
@ -24,17 +24,26 @@ openssl upgrade.
|
|||
Develop
|
||||
-------
|
||||
|
||||
#### Setup your local environment with virtualenv
|
||||
```bash
|
||||
$ make setup-venv
|
||||
```
|
||||
|
||||
#### Activate it
|
||||
```bash
|
||||
$ source ./venv/bin/activate
|
||||
```
|
||||
|
||||
#### Ready to develop with maturin
|
||||
|
||||
[maturin](https://github.com/PyO3/maturin) is used to build the python package.
|
||||
|
||||
To install development version of the package into your current Python environment:
|
||||
|
||||
```bash
|
||||
$ maturin develop
|
||||
$ make develop
|
||||
```
|
||||
|
||||
Code is formatted with https://github.com/psf/black.
|
||||
|
||||
|
||||
Build manylinux wheels
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
from .deltalake import RawDeltaTable, RawDeltaTableMetaData, rust_core_version
|
||||
from .schema import DataType, Field, Schema
|
||||
from .table import DeltaTable, Metadata
|
||||
from .schema import Schema, Field, DataType
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
from typing import Dict, List, Any, Optional
|
||||
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pyarrow
|
||||
|
||||
|
||||
# TODO: implement this module in Rust land to avoid JSON serialization
|
||||
# https://github.com/delta-io/delta-rs/issues/95
|
||||
|
||||
|
@ -23,7 +21,7 @@ class DataType:
|
|||
def __repr__(self) -> str:
|
||||
return self.__str__()
|
||||
|
||||
def __eq__(self, other: "DataType") -> bool:
|
||||
def __eq__(self, other: "DataType") -> bool: # type: ignore
|
||||
return self.type == other.type
|
||||
|
||||
@classmethod
|
||||
|
@ -36,11 +34,11 @@ class DataType:
|
|||
"""
|
||||
type_class = json_dict["type"]
|
||||
if type_class == "map":
|
||||
key_type = {"type": json_dict["keyType"]}
|
||||
value_type = {"type": json_dict["valueType"]}
|
||||
key_type_dict = {"type": json_dict["keyType"]}
|
||||
value_type_dict = {"type": json_dict["valueType"]}
|
||||
value_contains_null = json_dict["valueContainsNull"]
|
||||
key_type = cls.from_dict(json_dict=key_type)
|
||||
value_type = cls.from_dict(json_dict=value_type)
|
||||
key_type = cls.from_dict(json_dict=key_type_dict)
|
||||
value_type = cls.from_dict(json_dict=value_type_dict)
|
||||
return MapType(
|
||||
key_type=key_type,
|
||||
value_type=value_type,
|
||||
|
@ -78,13 +76,15 @@ class DataType:
|
|||
class MapType(DataType):
|
||||
"""Concrete class for map data types."""
|
||||
|
||||
def __init__(self, key_type: str, value_type: str, value_contains_null: bool):
|
||||
def __init__(
|
||||
self, key_type: "DataType", value_type: "DataType", value_contains_null: bool
|
||||
):
|
||||
super().__init__("map")
|
||||
self.key_type = key_type
|
||||
self.value_type = value_type
|
||||
self.value_contains_null = value_contains_null
|
||||
|
||||
def __eq__(self, other: "DataType") -> bool:
|
||||
def __eq__(self, other: "DataType") -> bool: # type: ignore
|
||||
return (
|
||||
isinstance(other, MapType)
|
||||
and self.key_type == other.key_type
|
||||
|
@ -104,7 +104,7 @@ class ArrayType(DataType):
|
|||
self.element_type = element_type
|
||||
self.contains_null = contains_null
|
||||
|
||||
def __eq__(self, other: "DataType") -> bool:
|
||||
def __eq__(self, other: "DataType") -> bool: # type: ignore
|
||||
return (
|
||||
isinstance(other, ArrayType)
|
||||
and self.element_type == other.element_type
|
||||
|
@ -122,7 +122,7 @@ class StructType(DataType):
|
|||
super().__init__("struct")
|
||||
self.fields = fields
|
||||
|
||||
def __eq__(self, other: "DataType") -> bool:
|
||||
def __eq__(self, other: "DataType") -> bool: # type: ignore
|
||||
return isinstance(other, StructType) and self.fields == other.fields
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
@ -148,7 +148,7 @@ class Field:
|
|||
def __str__(self) -> str:
|
||||
return f"Field({self.name}: {self.type} nullable({self.nullable}) metadata({self.metadata}))"
|
||||
|
||||
def __eq__(self, other: "Field") -> bool:
|
||||
def __eq__(self, other: "Field") -> bool: # type: ignore
|
||||
return (
|
||||
self.type == other.type
|
||||
and self.name == other.name
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from typing import List, Optional, Tuple
|
||||
import os
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import os
|
||||
import pyarrow
|
||||
from pyarrow.dataset import dataset, partitioning
|
||||
|
||||
|
@ -16,34 +16,34 @@ class Metadata:
|
|||
self._metadata = table.metadata()
|
||||
|
||||
@property
|
||||
def id(self):
|
||||
def id(self) -> int:
|
||||
"""Return the unique identifier of the DeltaTable."""
|
||||
return self._metadata.id
|
||||
|
||||
@property
|
||||
def name(self):
|
||||
def name(self) -> str:
|
||||
"""Return the user-provided identifier of the DeltaTable."""
|
||||
return self._metadata.name
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
def description(self) -> str:
|
||||
"""Return the user-provided description of the DeltaTable."""
|
||||
return self._metadata.description
|
||||
|
||||
@property
|
||||
def partition_columns(self):
|
||||
def partition_columns(self) -> List[str]:
|
||||
"""Return an array containing the names of the partitioned columns of the DeltaTable."""
|
||||
return self._metadata.partition_columns
|
||||
|
||||
@property
|
||||
def created_time(self):
|
||||
def created_time(self) -> int:
|
||||
"""
|
||||
Return The time when this metadata action is created, in milliseconds since the Unix epoch of the DeltaTable.
|
||||
"""
|
||||
return self._metadata.created_time
|
||||
|
||||
@property
|
||||
def configuration(self):
|
||||
def configuration(self) -> List[str]:
|
||||
"""Return the DeltaTable properties."""
|
||||
return self._metadata.configuration
|
||||
|
||||
|
@ -54,20 +54,6 @@ class Metadata:
|
|||
f"created_time: {self.created_time}, configuration={self._metadata.configuration})"
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.__str__()
|
||||
|
||||
def __eq__(self, other: "Metadata") -> bool:
|
||||
return (
|
||||
isinstance(other, Metadata)
|
||||
and self._metadata.id == other._metadata.id
|
||||
and self._metadata.name == other._metadata.name
|
||||
and self._metadata.description == other._metadata.description
|
||||
and self._metadata.partition_columns == other._metadata.partition_columns
|
||||
and self._metadata.created_time == other._metadata.created_time
|
||||
and self._metadata.configuration == other._metadata.configuration
|
||||
)
|
||||
|
||||
|
||||
class DeltaTable:
|
||||
"""Create a DeltaTable instance."""
|
||||
|
@ -99,7 +85,9 @@ class DeltaTable:
|
|||
"""
|
||||
return self._table.files()
|
||||
|
||||
def files_by_partitions(self, partition_filters: List[Tuple]) -> List[str]:
|
||||
def files_by_partitions(
|
||||
self, partition_filters: List[Tuple[str, str, Any]]
|
||||
) -> List[str]:
|
||||
"""
|
||||
Get the files that match a given list of partitions filters.
|
||||
Partitions which do not match the filter predicate will be removed from scanned data.
|
||||
|
@ -182,7 +170,7 @@ class DeltaTable:
|
|||
return pyarrow_schema_from_json(self._table.arrow_schema_json())
|
||||
|
||||
def to_pyarrow_dataset(
|
||||
self, partitions: Optional[List[Tuple]] = None
|
||||
self, partitions: Optional[List[Tuple[str, str, Any]]] = None
|
||||
) -> pyarrow.dataset.Dataset:
|
||||
"""
|
||||
Build a PyArrow Dataset using data from the DeltaTable.
|
||||
|
@ -227,7 +215,7 @@ class DeltaTable:
|
|||
)
|
||||
|
||||
def to_pyarrow_table(
|
||||
self, partitions: Optional[List[Tuple]] = None
|
||||
self, partitions: Optional[List[Tuple[str, str, Any]]] = None
|
||||
) -> pyarrow.Table:
|
||||
"""
|
||||
Build a PyArrow Table using data from the DeltaTable.
|
||||
|
|
|
@ -8,7 +8,6 @@ Loosely based on https://github.com/astropy/astropy/pull/347
|
|||
import os
|
||||
import warnings
|
||||
|
||||
|
||||
__licence__ = "BSD (3 clause)"
|
||||
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#
|
||||
import os
|
||||
import sys
|
||||
|
||||
import toml
|
||||
|
||||
sys.path.insert(0, os.path.abspath("../deltalake/"))
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
[mypy]
|
||||
disallow_any_generics = True
|
||||
disallow_subclassing_any = True
|
||||
disallow_untyped_calls = True
|
||||
disallow_untyped_defs = True
|
||||
disallow_incomplete_defs = True
|
||||
check_untyped_defs = True
|
||||
disallow_untyped_decorators = True
|
||||
no_implicit_optional = True
|
||||
warn_redundant_casts = True
|
||||
warn_unused_ignores = True
|
||||
warn_return_any = False
|
||||
implicit_reexport = False
|
||||
strict_equality = True
|
||||
|
||||
[mypy-pyarrow.*]
|
||||
ignore_missing_imports = True
|
||||
|
||||
[mypy-deltalake.deltalake]
|
||||
ignore_missing_imports = True
|
|
@ -0,0 +1,9 @@
|
|||
[build-system]
|
||||
requires = ["maturin==0.10.6"]
|
||||
build-backend = "maturin"
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
|
||||
[tool.black]
|
||||
exclude = "venv"
|
|
@ -1,15 +1,13 @@
|
|||
import pyarrow
|
||||
from deltalake import (
|
||||
DeltaTable,
|
||||
Field,
|
||||
)
|
||||
|
||||
from deltalake import DeltaTable, Field
|
||||
from deltalake.schema import (
|
||||
DataType,
|
||||
ArrayType,
|
||||
DataType,
|
||||
MapType,
|
||||
Schema,
|
||||
StructType,
|
||||
pyarrow_field_from_dict,
|
||||
Schema,
|
||||
)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue