Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
4cb60e6
Implement BaseDtypeTests for ArrowStringDtype
xhochy Jul 10, 2020
d242f2d
Refactor to use parametrized StringDtype
TomAugspurger Sep 3, 2020
d39ab2c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Feb 18, 2021
2367810
abs-imports
simonjayhawkins Feb 18, 2021
9166d3b
post merge fixup
simonjayhawkins Feb 19, 2021
8760705
StringDtype[python] -> string[python]
simonjayhawkins Feb 19, 2021
d5b3fec
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 22, 2021
2c657df
pre-commit fix for inconsistent use of pandas namespace
simonjayhawkins Mar 22, 2021
647a6c2
fix typo
simonjayhawkins Mar 22, 2021
0596fd7
pre-commit fixup - undefined name 'ArrowStringDtype'
simonjayhawkins Mar 22, 2021
c5a19c5
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 26, 2021
99680c9
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 28, 2021
69a6cc1
"StringDtype[storage]" -> "string[storage]" misc
simonjayhawkins Mar 28, 2021
bd147ba
__from_arrow__
simonjayhawkins Mar 28, 2021
830275f
more testing (wip)
simonjayhawkins Mar 28, 2021
214e524
fix inference
simonjayhawkins Mar 28, 2021
c9ba03c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 29, 2021
7425536
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
68ac391
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
5cfa97a
post-merge fixup
simonjayhawkins Apr 1, 2021
74dbf96
remove changes to test_string_dtype - broken off in #40725
simonjayhawkins Apr 1, 2021
3985943
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
3bda421
post merge fix-up
simonjayhawkins Apr 15, 2021
0c108a4
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
523e24c
post merge fix-up
simonjayhawkins Apr 15, 2021
279624c
revert some changes made for pre-commit checks.
simonjayhawkins Apr 15, 2021
80d231e
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 16, 2021
c5ced5a
post merge fix-up
simonjayhawkins Apr 16, 2021
459812c
undo unrelated changes
simonjayhawkins Apr 16, 2021
d707b6b
undo changes to imports
simonjayhawkins Apr 16, 2021
71ccf24
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 17, 2021
daaac06
StringDtype.construct_array_type - add ref to issue
simonjayhawkins Apr 17, 2021
46626d1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 19, 2021
3677bfa
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 1, 2021
42d382f
post merge fixup
simonjayhawkins May 1, 2021
4fb1a0d
add draft release note
simonjayhawkins May 1, 2021
5d4eac1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 12, 2021
15efb2e
post merge fix-up
simonjayhawkins May 12, 2021
b53cfe0
docstrings
simonjayhawkins May 12, 2021
b7db53f
benchmarks
simonjayhawkins May 12, 2021
3399f08
pyarrow min
simonjayhawkins May 12, 2021
e365f01
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 26, 2021
71d1e6c
post merge fixup
simonjayhawkins May 26, 2021
9e23c35
misc clean
simonjayhawkins May 26, 2021
c69a611
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 27, 2021
64b3206
update construct_from_string docstring
simonjayhawkins May 27, 2021
d83a4ff
update whatsnew for dtype="string"
simonjayhawkins May 27, 2021
ef38660
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 30, 2021
aef1162
update release note
simonjayhawkins May 30, 2021
6247a5b
paramertize test for df.convert_dtypes()
simonjayhawkins May 30, 2021
a6d066c
fixup pd.array and more testing of string_storage option
simonjayhawkins May 31, 2021
8adb08d
use string_storage fixture more
simonjayhawkins May 31, 2021
3ad0638
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 31, 2021
56714c9
post merge fixup
simonjayhawkins May 31, 2021
6a1cc2b
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 2, 2021
1761a84
remove accessor methods section from release note
simonjayhawkins Jun 2, 2021
3e26baa
consistent dtype naming in benchmark
simonjayhawkins Jun 2, 2021
6b470b1
Apply suggestions from code review
simonjayhawkins Jun 2, 2021
2ec6de0
name and str() change to "string"
simonjayhawkins Jun 2, 2021
a0b7a70
remove testing of sting dtype without storage specified.
simonjayhawkins Jun 2, 2021
d9dcd20
update StringDtype docstring
simonjayhawkins Jun 2, 2021
4a37470
add ArrowStringArray to pd.arrays namespace
simonjayhawkins Jun 2, 2021
1d59c7a
add common base class, BaseStringArray
simonjayhawkins Jun 2, 2021
e57c850
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 4, 2021
51f1b1d
fixup roundtrip tests
simonjayhawkins Jun 4, 2021
fc95c06
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 7, 2021
ef02a43
remove link
simonjayhawkins Jun 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fixup pd.array and more testing of string_storage option
  • Loading branch information
simonjayhawkins committed May 31, 2021
commit a6d066ca43f44879f4a01c74c805b2bf4b0790b7
16 changes: 16 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1131,6 +1131,22 @@ def nullable_string_dtype(request):
return request.param


@pytest.fixture(
params=[
"python",
pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
]
)
def string_storage(request):
"""
Parametrized fixture for pd.options.mode.string_storage.

* 'python'
* 'pyarrow'
"""
return request.param


@pytest.fixture(params=tm.BYTES_DTYPES)
def bytes_dtype(request):
"""
Expand Down
11 changes: 6 additions & 5 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def __init__(self, values, copy=False):
super().__init__(values, copy=copy)
# error: Incompatible types in assignment (expression has type "StringDtype",
# variable has type "PandasDtype")
NDArrayBacked.__init__(self, self._ndarray, StringDtype())
NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
if not isinstance(values, type(self)):
self._validate()

Expand All @@ -311,8 +311,9 @@ def _validate(self):

@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
if dtype:
assert dtype == "string"
if dtype and not (isinstance(dtype, str) and dtype == "string"):
dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "python"

from pandas.core.arrays.masked import BaseMaskedArray

Expand All @@ -332,7 +333,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
# Manually creating new array avoids the validation step in the __init__, so is
# faster. Refactor need for validation?
new_string_array = cls.__new__(cls)
NDArrayBacked.__init__(new_string_array, result, StringDtype())
NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

return new_string_array

Expand Down Expand Up @@ -501,7 +502,7 @@ def _str_map(
from pandas.arrays import BooleanArray

if dtype is None:
dtype = StringDtype()
dtype = StringDtype(storage="python")
if na_value is None:
na_value = self.dtype.na_value

Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -154,6 +155,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)

cls._chk_pyarrow_available()

if dtype and not (isinstance(dtype, str) and dtype == "string"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add some introspection functions

is_string_python_dtype
is_string_arrow_dtype
is_string_dtype

to encompass operations

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

followon ok if you can create an issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can discuss this once we have merged what we got in case things change. This conditional is slightly different from what would be in the introspection functions since this allows "string" whatever the global storage whereas StringDtype(), without the storage given uses the global option.

I'm probably overthinking or completely misunderstood @jorisvandenbossche suggestion that "StringDtype()" should defer the lookup.

dtype = pandas_dtype(dtype)
assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"

if isinstance(scalars, BaseMaskedArray):
# avoid costly conversion to object dtype in ensure_string_array and
# numerical issues with Float32Dtype
Expand Down
25 changes: 19 additions & 6 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,18 +113,22 @@ def array(

Currently, pandas will infer an extension dtype for sequences of

============================== =====================================
============================== =======================================
Scalar Type Array Type
============================== =====================================
============================== =======================================
:class:`pandas.Interval` :class:`pandas.arrays.IntervalArray`
:class:`pandas.Period` :class:`pandas.arrays.PeriodArray`
:class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray`
:class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray`
:class:`int` :class:`pandas.arrays.IntegerArray`
:class:`float` :class:`pandas.arrays.FloatingArray`
:class:`str` :class:`pandas.arrays.StringArray`
:class:`str` :class:`pandas.arrays.StringArray` or
:class:`pandas.arrays.ArrowStringArray`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ArrowStringArray is not added to the API docs, so this link won't work right now

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe should merge #40962 first (or cut down alternative) bit of chicken and egg, before this is merged naming can change, and vice-versa, I guess having both in the same PR is the only good solution even if more to review.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are all agreed with the name ArrowStringArray. so perhaps nothing stopping us making it public now.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done in 4a37470

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is an implementation detail. hopefully we can revert

:class:`bool` :class:`pandas.arrays.BooleanArray`
============================== =====================================
============================== =======================================

The ExtensionArray created when the scalar type is :class:`str` is determined by
pd.options.mode.string_storage if the dtype is not explicitly given.

For all other cases, NumPy's usual inference rules will be used.

Expand Down Expand Up @@ -240,6 +244,14 @@ def array(
['a', <NA>, 'c']
Length: 3, dtype: string[python]

>>> with pd.option_context("string_storage", "pyarrow"):
... arr = pd.array(["a", None, "c"])
...
>>> arr
<ArrowStringArray>
['a', <NA>, 'c']
Length: 3, dtype: string[pyarrow]

>>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
<PeriodArray>
['2000-01-01', '2000-01-01']
Expand Down Expand Up @@ -292,10 +304,10 @@ def array(
IntegerArray,
IntervalArray,
PandasArray,
StringArray,
TimedeltaArray,
period_array,
)
from pandas.core.arrays.string_ import StringDtype

if lib.is_scalar(data):
msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
Expand Down Expand Up @@ -345,7 +357,8 @@ def array(
return TimedeltaArray._from_sequence(data, copy=copy)

elif inferred_dtype == "string":
return StringArray._from_sequence(data, copy=copy)
# StringArray/ArrowStringArray depending on pd.options.mode.string_storage
return StringDtype().construct_array_type()._from_sequence(data, copy=copy)

elif inferred_dtype == "integer":
return IntegerArray._from_sequence(data, copy=copy)
Expand Down
74 changes: 64 additions & 10 deletions pandas/tests/arrays/string_/test_string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,33 @@

pa = pytest.importorskip("pyarrow", minversion="1.0.0")

from pandas.core.arrays.string_ import (
StringArray,
StringDtype,
)
from pandas.core.arrays.string_arrow import ArrowStringArray


def test_eq_all_na():
a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow"))
a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
result = a == a
expected = pd.array([pd.NA, pd.NA], dtype="boolean")
tm.assert_extension_array_equal(result, expected)


def test_config():
# python by default
assert pd.StringDtype().storage == "python"
arr = pd.array(["a", "b"])
assert arr.dtype.storage == "python"
def test_config(string_storage):
with pd.option_context("string_storage", string_storage):
assert StringDtype().storage == string_storage
result = pd.array(["a", "b"])
assert result.dtype.storage == string_storage

with pd.option_context("mode.string_storage", "pyarrow"):
assert pd.StringDtype().storage == "pyarrow"
arr = pd.array(["a", "b"])
assert arr.dtype.storage == "pyarrow"
expected = (
StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
)
tm.assert_equal(result, expected)


def test_config_bad_storage_raises():
msg = re.escape("Value must be one of python|pyarrow")
with pytest.raises(ValueError, match=msg):
pd.options.mode.string_storage = "foo"
Expand All @@ -50,3 +56,51 @@ def test_constructor_not_string_type_raises(array, chunked):
)
with pytest.raises(ValueError, match=msg):
ArrowStringArray(arr)


def test_from_sequence_wrong_dtype_raises():
with pd.option_context("string_storage", "python"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

with pd.option_context("string_storage", "pyarrow"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")

with pytest.raises(AssertionError, match=None):
ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")

ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

with pytest.raises(AssertionError, match=None):
with pd.option_context("string_storage", "python"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

with pd.option_context("string_storage", "pyarrow"):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

with pytest.raises(AssertionError, match=None):
ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))

ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))

with pd.option_context("string_storage", "python"):
StringArray._from_sequence(["a", None, "c"], dtype="string")

with pd.option_context("string_storage", "pyarrow"):
StringArray._from_sequence(["a", None, "c"], dtype="string")

StringArray._from_sequence(["a", None, "c"], dtype="string[python]")

with pytest.raises(AssertionError, match=None):
StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")

with pd.option_context("string_storage", "python"):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

with pytest.raises(AssertionError, match=None):
with pd.option_context("string_storage", "pyarrow"):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())

StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))

with pytest.raises(AssertionError, match=None):
StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
23 changes: 18 additions & 5 deletions pandas/tests/arrays/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
IntegerArray,
IntervalArray,
SparseArray,
StringArray,
TimedeltaArray,
)
from pandas.core.arrays import (
Expand Down Expand Up @@ -132,8 +131,16 @@
([1, None], "Int16", pd.array([1, None], dtype="Int16")),
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# String
(["a", None], "string", StringArray._from_sequence(["a", None])),
(["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])),
(
["a", None],
"string",
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
(
["a", None],
pd.StringDtype(),
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, None], "boolean", BooleanArray._from_sequence([True, None])),
([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
Expand Down Expand Up @@ -253,8 +260,14 @@ def test_array_copy():
([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
# string
(["a", "b"], StringArray._from_sequence(["a", "b"])),
(["a", None], StringArray._from_sequence(["a", None])),
(
["a", "b"],
pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
),
(
["a", None],
pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
),
# Boolean
([True, False], BooleanArray._from_sequence([True, False])),
([True, None], BooleanArray._from_sequence([True, None])),
Expand Down
21 changes: 12 additions & 9 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def test_searchsorted(self):
assert result == 10

@pytest.mark.parametrize("box", [None, "index", "series"])
def test_searchsorted_castable_strings(self, arr1d, box, request):
def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
if isinstance(arr1d, DatetimeArray):
tz = arr1d.tz
ts1, ts2 = arr1d[1:3]
Expand Down Expand Up @@ -341,14 +341,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request):
):
arr.searchsorted("foo")

with pytest.raises(
TypeError,
match=re.escape(
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
"or array of those. Got 'StringArray' instead."
),
):
arr.searchsorted([str(arr[1]), "baz"])
arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray"

with pd.option_context("string_storage", string_storage):
with pytest.raises(
TypeError,
match=re.escape(
f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
f"or array of those. Got '{arr_type}' instead."
),
):
arr.searchsorted([str(arr[1]), "baz"])

def test_getitem_near_implementation_bounds(self):
# We only check tz-naive for DTA bc the bounds are slightly different
Expand Down
22 changes: 20 additions & 2 deletions pandas/tests/series/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pandas._libs.tslibs import iNaT
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
NA,
Categorical,
Expand Down Expand Up @@ -377,17 +378,34 @@ class TestAstypeString:
# currently no way to parse IntervalArray from a list of strings
],
)
def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
def test_astype_string_to_extension_dtype_roundtrip(
self, data, dtype, request, string_storage
):
if dtype == "boolean" or (
dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data
):
mark = pytest.mark.xfail(
reason="TODO StringArray.astype() with missing values #GH40566"
)
request.node.add_marker(mark)

if string_storage == "pyarrow" and dtype in (
"category",
"datetime64[ns]",
"datetime64[ns, US/Eastern]",
"UInt16",
"period[M]",
):
mark = pytest.mark.xfail(
reason="TypeError: Cannot interpret ... as a data type"
)
request.node.add_marker(mark)

# GH-40351
s = Series(data, dtype=dtype)
tm.assert_series_equal(s, s.astype("string").astype(dtype))
with pd.option_context("string_storage", string_storage):
result = s.astype("string").astype(dtype)
tm.assert_series_equal(result, s)


class TestAstypeCategorical:
Expand Down
5 changes: 4 additions & 1 deletion pandas/tests/strings/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
MultiIndex,
Series,
_testing as tm,
get_option,
)
from pandas.core import strings as strings

Expand Down Expand Up @@ -128,7 +129,9 @@ def test_api_per_method(
def test_api_for_categorical(any_string_method, any_string_dtype, request):
# http://github.com/pandas-dev/pandas/issues/10661

if any_string_dtype == "string[pyarrow]":
if any_string_dtype == "string[pyarrow]" or (
any_string_dtype == "string" and get_option("string_storage") == "pyarrow"
):
# unsupported operand type(s) for +: 'ArrowStringArray' and 'str'
mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented")
request.node.add_marker(mark)
Expand Down