Improve dev experience with Python binding (#263)

* Add get_file_paths_by_partitions in Python bindings

* Add makefile, mypy, isort
This commit is contained in:
fvaleye 2021-05-24 19:51:57 +02:00 committed by GitHub
parent 936caaad27
commit 0c3853ea13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 175 additions and 75 deletions

View File

@ -19,20 +19,18 @@ jobs:
uses: actions/setup-python@v2
with:
python-version: 3.6
- name: Format Python code with Black
- name: Check Python
run: |
pip install black
black . --check
pip install black isort mypy
make check-python
- name: Install minimal stable with clippy and rustfmt
uses: actions-rs/toolchain@v1
with:
profile: default
toolchain: stable
override: true
- name: Check
run: cargo clippy
- name: Format
run: cargo fmt -- --check
- name: Check Rust
run: make check-rust
test:
runs-on: ubuntu-latest
@ -57,24 +55,20 @@ jobs:
- name: Enable manylinux Python targets
run: echo "/opt/python/cp36-cp36m/bin" >> $GITHUB_PATH
- name: Install maturin
run: pip install maturin==0.10.4
- name: Build and install deltalake
run: |
make setup
# disable manylinux audit checks for test builds
maturin build --manylinux off
ls -lh ../target/wheels
pip install $(printf ../target/wheels/deltalake-*-cp36-abi3-*.whl)'[devel,pandas]'
- name: Run tests
run: |
py.test --cov tests -m 'not integration'
run: make unit-test
# - name: Run Integration tests
# run: |
# py.test --cov tests -m integration
- name: Build Sphinx documentation
run: |
sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html
run: make build-documentation

View File

@ -52,14 +52,13 @@ jobs:
with:
python-version: 3.6
- name: Install matruin
run: |
pip install maturin==0.10.4
- name: Setup
run: make setup
- name: Publish to pypi (without sdist)
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: maturin publish -b pyo3 --target ${{ matrix.target }} --no-sdist
run: make publish TARGET="${{ matrix.target }}" EXTRA_ARGS="--no-sdist"
release-github-pypi-manylinux:
needs: validate-release-tag
@ -86,15 +85,14 @@ jobs:
run: |
echo "/opt/python/cp36-cp36m/bin" >> $GITHUB_PATH
- name: Install matruin
run: |
pip install maturin==0.10.4
- name: Setup
run: make setup
- name: Publish manylinux to pypi (without sdist)
env:
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
# the Linux build also uploads the sdist, which updates the project description on PyPI
run: maturin publish -b pyo3 --target x86_64-unknown-linux-gnu
run: make publish TARGET="x86_64-unknown-linux-gnu" EXTRA_ARGS=""
release-docs:
runs-on: ubuntu-latest
@ -111,13 +109,13 @@ jobs:
- name: Build and install deltalake
run: |
pip install maturin==0.10.4
make setup
maturin build --manylinux off
pip install $(printf ../target/wheels/deltalake-*-cp36-abi3-*.whl)'[devel,pandas]'
- name: Build Sphinx documentation
run: |
sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html
make build-documentation
mv docs/build ~/build
echo "Configuring git..."

13
python/.gitignore vendored
View File

@ -1,3 +1,16 @@
# venv
venv
# Byte-compiled / optimized / DLL files
__pycache__/
# Unit test / coverage reports
.coverage
.pytest_cache/
# mypy
.mypy_cache/
# sphinx build directory
docs/build

View File

@ -43,6 +43,8 @@ requires-dist = [
"pyarrow>=4",
'numpy<1.20.0;python_version<="3.6"',
"pandas; extra =='pandas'",
"mypy; extra == 'devel'",
"isort; extra == 'devel'",
"pytest; extra == 'devel'",
"pytest-mock; extra == 'devel'",
"pytest-cov; extra == 'devel'",
@ -51,4 +53,4 @@ requires-dist = [
"sphinx-rtd-theme; extra == 'devel'",
"toml; extra == 'devel'",
]
provides-extra = ["pandas", "devel"]
provides-extra = ["pandas", "devel"]

69
python/Makefile Normal file
View File

@ -0,0 +1,69 @@
# Developer task runner for the Python binding.
# Running `make` with no arguments prints the list of documented targets.
# Every target below is a command, not a file, so each one is declared .PHONY.
#
# NOTE: make expands every line of a recipe before it runs the first command.
# Recipes with several steps therefore print their banners with `@echo`
# (executed in order) instead of $(info ...), which would emit all banners
# up front, before any step has run.

.DEFAULT_GOAL := help

# Directory of the local virtualenv: created by `setup-venv`, removed by `clean`.
VENV := venv

.PHONY: setup-venv
setup-venv: ## Setup the virtualenv
	$(info --- Setup virtualenv ---)
	python -m venv $(VENV)

.PHONY: setup
setup: ## Setup the requirements
	$(info --- Setup dependencies ---)
	pip install maturin==0.10.6

.PHONY: develop
develop: setup ## Install Python binding of delta-rs
	$(info --- Maturin develop ---)
	maturin develop

.PHONY: format
format: ## Format the code
	@echo '--- Rust format ---'
	cargo fmt
	@echo '--- Python format ---'
	black .
	isort .

.PHONY: check-rust
check-rust: ## Run check on Rust
	@echo '--- Check Rust clippy ---'
	cargo clippy
	@echo '--- Check Rust format ---'
	cargo fmt -- --check

.PHONY: check-python
check-python: ## Run check on Python
	@echo 'Check Python isort'
	isort --check-only .
	@echo 'Check Python black'
	black --check .
	@echo 'Check Python mypy'
	mypy --exclude "$(VENV)|docs|tests" .

.PHONY: unit-test
unit-test: ## Run unit test
	$(info --- Run Python unit-test ---)
	py.test --cov tests -m 'not integration'

.PHONY: build-documentation
build-documentation: ## Build documentation with Sphinx
	$(info --- Run build of the Sphinx documentation ---)
	sphinx-build -Wn -b html -d ./docs/build/doctrees ./docs/source ./docs/build/html

# Usage: make publish TARGET=<rust target triple> [EXTRA_ARGS="<maturin flags>"]
# TARGET is mandatory: without it `--target` would swallow the next argument
# (or nothing at all), so fail early with a clear message instead.
.PHONY: publish
publish: ## Publish Python binding version in Pypi
	$(if $(strip $(TARGET)),,$(error TARGET is required - usage: make publish TARGET=<target-triple>))
	$(info --- Run publish of the Python binding ---)
	maturin publish -b pyo3 --target $(TARGET) $(EXTRA_ARGS)

.PHONY: clean
clean: ## Run clean
	$(warning --- Clean virtualenv and target directory ---)
	cargo clean
	rm -rf $(VENV)
	find . -type f -name '*.pyc' -delete

# Lists every rule whose header carries a trailing `## description`,
# sorted by target name. `$$` passes a literal `$` through to grep/awk.
.PHONY: help
help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

View File

@ -24,17 +24,26 @@ openssl upgrade.
Develop
-------
#### Setup your local environment with virtualenv
```bash
$ make setup-venv
```
#### Activate it
```bash
$ source ./venv/bin/activate
```
#### Ready to develop with maturin
[maturin](https://github.com/PyO3/maturin) is used to build the python package.
To install the development version of the package into your current Python environment:
```bash
$ maturin develop
$ make develop
```
Code is formatted with https://github.com/psf/black.
Build manylinux wheels
----------------------

View File

@ -1,3 +1,3 @@
from .deltalake import RawDeltaTable, RawDeltaTableMetaData, rust_core_version
from .schema import DataType, Field, Schema
from .table import DeltaTable, Metadata
from .schema import Schema, Field, DataType

View File

@ -1,10 +1,8 @@
from typing import Dict, List, Any, Optional
import json
from typing import Any, Dict, List, Optional
import pyarrow
# TODO: implement this module in Rust land to avoid JSON serialization
# https://github.com/delta-io/delta-rs/issues/95
@ -23,7 +21,7 @@ class DataType:
def __repr__(self) -> str:
return self.__str__()
def __eq__(self, other: "DataType") -> bool:
def __eq__(self, other: "DataType") -> bool: # type: ignore
return self.type == other.type
@classmethod
@ -36,11 +34,11 @@ class DataType:
"""
type_class = json_dict["type"]
if type_class == "map":
key_type = {"type": json_dict["keyType"]}
value_type = {"type": json_dict["valueType"]}
key_type_dict = {"type": json_dict["keyType"]}
value_type_dict = {"type": json_dict["valueType"]}
value_contains_null = json_dict["valueContainsNull"]
key_type = cls.from_dict(json_dict=key_type)
value_type = cls.from_dict(json_dict=value_type)
key_type = cls.from_dict(json_dict=key_type_dict)
value_type = cls.from_dict(json_dict=value_type_dict)
return MapType(
key_type=key_type,
value_type=value_type,
@ -78,13 +76,15 @@ class DataType:
class MapType(DataType):
"""Concrete class for map data types."""
def __init__(self, key_type: str, value_type: str, value_contains_null: bool):
def __init__(
self, key_type: "DataType", value_type: "DataType", value_contains_null: bool
):
super().__init__("map")
self.key_type = key_type
self.value_type = value_type
self.value_contains_null = value_contains_null
def __eq__(self, other: "DataType") -> bool:
def __eq__(self, other: "DataType") -> bool: # type: ignore
return (
isinstance(other, MapType)
and self.key_type == other.key_type
@ -104,7 +104,7 @@ class ArrayType(DataType):
self.element_type = element_type
self.contains_null = contains_null
def __eq__(self, other: "DataType") -> bool:
def __eq__(self, other: "DataType") -> bool: # type: ignore
return (
isinstance(other, ArrayType)
and self.element_type == other.element_type
@ -122,7 +122,7 @@ class StructType(DataType):
super().__init__("struct")
self.fields = fields
def __eq__(self, other: "DataType") -> bool:
def __eq__(self, other: "DataType") -> bool: # type: ignore
return isinstance(other, StructType) and self.fields == other.fields
def __str__(self) -> str:
@ -148,7 +148,7 @@ class Field:
def __str__(self) -> str:
return f"Field({self.name}: {self.type} nullable({self.nullable}) metadata({self.metadata}))"
def __eq__(self, other: "Field") -> bool:
def __eq__(self, other: "Field") -> bool: # type: ignore
return (
self.type == other.type
and self.name == other.name

View File

@ -1,7 +1,7 @@
from typing import List, Optional, Tuple
import os
from typing import Any, List, Optional, Tuple
from urllib.parse import urlparse
import os
import pyarrow
from pyarrow.dataset import dataset, partitioning
@ -16,34 +16,34 @@ class Metadata:
self._metadata = table.metadata()
@property
def id(self):
def id(self) -> int:
"""Return the unique identifier of the DeltaTable."""
return self._metadata.id
@property
def name(self):
def name(self) -> str:
"""Return the user-provided identifier of the DeltaTable."""
return self._metadata.name
@property
def description(self):
def description(self) -> str:
"""Return the user-provided description of the DeltaTable."""
return self._metadata.description
@property
def partition_columns(self):
def partition_columns(self) -> List[str]:
"""Return an array containing the names of the partitioned columns of the DeltaTable."""
return self._metadata.partition_columns
@property
def created_time(self):
def created_time(self) -> int:
"""
Return The time when this metadata action is created, in milliseconds since the Unix epoch of the DeltaTable.
"""
return self._metadata.created_time
@property
def configuration(self):
def configuration(self) -> List[str]:
"""Return the DeltaTable properties."""
return self._metadata.configuration
@ -54,20 +54,6 @@ class Metadata:
f"created_time: {self.created_time}, configuration={self._metadata.configuration})"
)
def __repr__(self) -> str:
return self.__str__()
def __eq__(self, other: "Metadata") -> bool:
return (
isinstance(other, Metadata)
and self._metadata.id == other._metadata.id
and self._metadata.name == other._metadata.name
and self._metadata.description == other._metadata.description
and self._metadata.partition_columns == other._metadata.partition_columns
and self._metadata.created_time == other._metadata.created_time
and self._metadata.configuration == other._metadata.configuration
)
class DeltaTable:
"""Create a DeltaTable instance."""
@ -99,7 +85,9 @@ class DeltaTable:
"""
return self._table.files()
def files_by_partitions(self, partition_filters: List[Tuple]) -> List[str]:
def files_by_partitions(
self, partition_filters: List[Tuple[str, str, Any]]
) -> List[str]:
"""
Get the files that match a given list of partitions filters.
Partitions which do not match the filter predicate will be removed from scanned data.
@ -182,7 +170,7 @@ class DeltaTable:
return pyarrow_schema_from_json(self._table.arrow_schema_json())
def to_pyarrow_dataset(
self, partitions: Optional[List[Tuple]] = None
self, partitions: Optional[List[Tuple[str, str, Any]]] = None
) -> pyarrow.dataset.Dataset:
"""
Build a PyArrow Dataset using data from the DeltaTable.
@ -227,7 +215,7 @@ class DeltaTable:
)
def to_pyarrow_table(
self, partitions: Optional[List[Tuple]] = None
self, partitions: Optional[List[Tuple[str, str, Any]]] = None
) -> pyarrow.Table:
"""
Build a PyArrow Table using data from the DeltaTable.

View File

@ -8,7 +8,6 @@ Loosely based on https://github.com/astropy/astropy/pull/347
import os
import warnings
__licence__ = "BSD (3 clause)"

View File

@ -12,6 +12,7 @@
#
import os
import sys
import toml
sys.path.insert(0, os.path.abspath("../deltalake/"))

20
python/mypy.ini Normal file
View File

@ -0,0 +1,20 @@
# mypy configuration for the Python binding.
# The [mypy] section holds the global options: a strict-leaning set of
# disallow_*/warn_* checks. Note that warn_return_any is explicitly turned off.
[mypy]
disallow_any_generics = True
disallow_subclassing_any = True
disallow_untyped_calls = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
check_untyped_defs = True
disallow_untyped_decorators = True
no_implicit_optional = True
warn_redundant_casts = True
# Flags `# type: ignore` comments that no longer suppress anything.
warn_unused_ignores = True
warn_return_any = False
implicit_reexport = False
strict_equality = True
# Per-module override: do not report unresolved imports from pyarrow.
[mypy-pyarrow.*]
ignore_missing_imports = True
# Per-module override for deltalake.deltalake - presumably the compiled Rust
# extension module, which mypy cannot analyse from source (TODO confirm).
[mypy-deltalake.deltalake]
ignore_missing_imports = True

9
python/pyproject.toml Normal file
View File

@ -0,0 +1,9 @@
# Build backend for the Python package: maturin, pinned to one exact version.
[build-system]
requires = ["maturin==0.10.6"]
build-backend = "maturin"
# isort runs with its "black" profile - intended to keep import sorting
# compatible with black's formatting.
[tool.isort]
profile = "black"
# Keep black away from paths matching "venv" (the local virtualenv directory).
[tool.black]
exclude = "venv"

View File

@ -1,15 +1,13 @@
import pyarrow
from deltalake import (
DeltaTable,
Field,
)
from deltalake import DeltaTable, Field
from deltalake.schema import (
DataType,
ArrayType,
DataType,
MapType,
Schema,
StructType,
pyarrow_field_from_dict,
Schema,
)