Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
4cb60e6
Implement BaseDtypeTests for ArrowStringDtype
xhochy Jul 10, 2020
d242f2d
Refactor to use parametrized StringDtype
TomAugspurger Sep 3, 2020
d39ab2c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Feb 18, 2021
2367810
abs-imports
simonjayhawkins Feb 18, 2021
9166d3b
post merge fixup
simonjayhawkins Feb 19, 2021
8760705
StringDtype[python] -> string[python]
simonjayhawkins Feb 19, 2021
d5b3fec
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 22, 2021
2c657df
pre-commit fix for inconsistent use of pandas namespace
simonjayhawkins Mar 22, 2021
647a6c2
fix typo
simonjayhawkins Mar 22, 2021
0596fd7
pre-commit fixup - undefined name 'ArrowStringDtype'
simonjayhawkins Mar 22, 2021
c5a19c5
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 26, 2021
99680c9
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 28, 2021
69a6cc1
"StringDtype[storage]" -> "string[storage]" misc
simonjayhawkins Mar 28, 2021
bd147ba
__from_arrow__
simonjayhawkins Mar 28, 2021
830275f
more testing (wip)
simonjayhawkins Mar 28, 2021
214e524
fix inference
simonjayhawkins Mar 28, 2021
c9ba03c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 29, 2021
7425536
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
68ac391
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
5cfa97a
post-merge fixup
simonjayhawkins Apr 1, 2021
74dbf96
remove changes to test_string_dtype - broken off in #40725
simonjayhawkins Apr 1, 2021
3985943
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
3bda421
post merge fix-up
simonjayhawkins Apr 15, 2021
0c108a4
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
523e24c
post merge fix-up
simonjayhawkins Apr 15, 2021
279624c
revert some changes made for pre-commit checks.
simonjayhawkins Apr 15, 2021
80d231e
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 16, 2021
c5ced5a
post merge fix-up
simonjayhawkins Apr 16, 2021
459812c
undo unrelated changes
simonjayhawkins Apr 16, 2021
d707b6b
undo changes to imports
simonjayhawkins Apr 16, 2021
71ccf24
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 17, 2021
daaac06
StringDtype.construct_array_type - add ref to issue
simonjayhawkins Apr 17, 2021
46626d1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 19, 2021
3677bfa
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 1, 2021
42d382f
post merge fixup
simonjayhawkins May 1, 2021
4fb1a0d
add draft release note
simonjayhawkins May 1, 2021
5d4eac1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 12, 2021
15efb2e
post merge fix-up
simonjayhawkins May 12, 2021
b53cfe0
docstrings
simonjayhawkins May 12, 2021
b7db53f
benchmarks
simonjayhawkins May 12, 2021
3399f08
pyarrow min
simonjayhawkins May 12, 2021
e365f01
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 26, 2021
71d1e6c
post merge fixup
simonjayhawkins May 26, 2021
9e23c35
misc clean
simonjayhawkins May 26, 2021
c69a611
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 27, 2021
64b3206
update construct_from_string docstring
simonjayhawkins May 27, 2021
d83a4ff
update whatsnew for dtype="string"
simonjayhawkins May 27, 2021
ef38660
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 30, 2021
aef1162
update release note
simonjayhawkins May 30, 2021
6247a5b
paramertize test for df.convert_dtypes()
simonjayhawkins May 30, 2021
a6d066c
fixup pd.array and more testing of string_storage option
simonjayhawkins May 31, 2021
8adb08d
use string_storage fixture more
simonjayhawkins May 31, 2021
3ad0638
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 31, 2021
56714c9
post merge fixup
simonjayhawkins May 31, 2021
6a1cc2b
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 2, 2021
1761a84
remove accessor methods section from release note
simonjayhawkins Jun 2, 2021
3e26baa
consistent dtype naming in benchmark
simonjayhawkins Jun 2, 2021
6b470b1
Apply suggestions from code review
simonjayhawkins Jun 2, 2021
2ec6de0
name and str() change to "string"
simonjayhawkins Jun 2, 2021
a0b7a70
remove testing of sting dtype without storage specified.
simonjayhawkins Jun 2, 2021
d9dcd20
update StringDtype docstring
simonjayhawkins Jun 2, 2021
4a37470
add ArrowStringArray to pd.arrays namespace
simonjayhawkins Jun 2, 2021
1d59c7a
add common base class, BaseStringArray
simonjayhawkins Jun 2, 2021
e57c850
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 4, 2021
51f1b1d
fixup roundtrip tests
simonjayhawkins Jun 4, 2021
fc95c06
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 7, 2021
ef02a43
remove link
simonjayhawkins Jun 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
name and str() change to "string"
  • Loading branch information
simonjayhawkins committed Jun 2, 2021
commit 2ec6de0446abe9ba9c33de331d97c232fb1e55ba
13 changes: 11 additions & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
TimedeltaArray,
)
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
from pandas.core.arrays.string_ import StringDtype

from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -638,12 +639,20 @@ def raise_assert_detail(obj, message, left, right, diff=None, index_values=None)

if isinstance(left, np.ndarray):
left = pprint_thing(left)
elif is_categorical_dtype(left) or isinstance(left, PandasDtype):
elif (
is_categorical_dtype(left)
or isinstance(left, PandasDtype)
or isinstance(left, StringDtype)
):
left = repr(left)

if isinstance(right, np.ndarray):
right = pprint_thing(right)
elif is_categorical_dtype(right) or isinstance(right, PandasDtype):
elif (
is_categorical_dtype(right)
or isinstance(right, PandasDtype)
or isinstance(right, StringDtype)
):
right = repr(right)

msg += f"""
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ class StringDtype(ExtensionDtype):
string[python]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mention that the output repr changed in the release note as well

Copy link
Member Author

@simonjayhawkins simonjayhawkins Jun 2, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with latest changes, no longer applicable in majority of cases.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

now that the name and the str() repr and name has changed, we could maybe change this back to StringDtype and add the stroage.

i.e StringDtype(storage="python")

"""

name = "string"

#: StringDtype.na_value uses pandas.NA
na_value = libmissing.NA
_metadata = ("storage",)
Expand All @@ -102,10 +104,6 @@ def __init__(self, storage=None):

self.storage = storage

@property
def name(self):
return f"string[{self.storage}]"

@property
def type(self) -> type[str]:
return str
Expand Down Expand Up @@ -182,6 +180,9 @@ def construct_array_type( # type: ignore[override]
return ArrowStringArray

def __repr__(self):
return f"string[{self.storage}]"

def __str__(self):
return self.name

def __from_arrow__(
Expand Down Expand Up @@ -268,7 +269,7 @@ class StringArray(PandasArray):
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
<StringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string[python]
Length: 4, dtype: string

Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
will convert the values to strings.
Expand All @@ -280,7 +281,7 @@ class StringArray(PandasArray):
>>> pd.array(['1', 1], dtype="string")
<StringArray>
['1', '1']
Length: 2, dtype: string[python]
Length: 2, dtype: string

However, instantiating StringArrays directly with non-strings will raise an error.

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
>>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
<ArrowStringArray>
['This is', 'some text', <NA>, 'data.']
Length: 4, dtype: string[pyarrow]
Length: 4, dtype: string
"""

def __init__(self, values):
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,15 +239,15 @@ def array(
>>> pd.array(["a", None, "c"])
<StringArray>
['a', <NA>, 'c']
Length: 3, dtype: string[python]
Length: 3, dtype: string

>>> with pd.option_context("string_storage", "pyarrow"):
... arr = pd.array(["a", None, "c"])
...
>>> arr
<ArrowStringArray>
['a', <NA>, 'c']
Length: 3, dtype: string[pyarrow]
Length: 3, dtype: string

>>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
<PeriodArray>
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6139,12 +6139,12 @@ def convert_dtypes(
2 3 z <NA> <NA> 20 200.0

>>> dfn.dtypes
a Int32
b string[python]
c boolean
d string[python]
e Int64
f Float64
a Int32
b string
c boolean
d string
e Int64
f Float64
dtype: object

Start with a Series of strings and missing data represented by ``np.nan``.
Expand All @@ -6162,7 +6162,7 @@ def convert_dtypes(
0 a
1 b
2 <NA>
dtype: string[python]
dtype: string
"""
if self.ndim == 1:
return self._convert_dtypes(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3080,7 +3080,7 @@ def _result_dtype(arr):
from pandas.core.arrays.string_ import StringDtype

if isinstance(arr.dtype, StringDtype):
return arr.dtype.name
return arr.dtype
else:
return object

Expand Down
8 changes: 2 additions & 6 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,11 @@ def test_repr(dtype):
expected = " A\n0 a\n1 <NA>\n2 b"
assert repr(df) == expected

expected = (
f"0 a\n1 <NA>\n2 b\nName: A, dtype: string[{dtype.storage}]"
)
expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string"
assert repr(df.A) == expected

arr_name = "ArrowStringArray" if dtype.storage == "pyarrow" else "StringArray"
expected = (
f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string[{dtype.storage}]"
)
expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string"
assert repr(df.A.array) == expected


Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/extension/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,9 @@ def data_for_grouping(dtype, chunked):


class TestDtype(base.BaseDtypeTests):
pass
def test_eq_with_str(self, dtype):
assert dtype == f"string[{dtype.storage}]"
super().test_eq_with_str(dtype)


class TestInterface(base.BaseInterfaceTests):
Expand All @@ -106,7 +108,9 @@ def test_view(self, data, request):


class TestConstructors(base.BaseConstructorsTests):
pass
def test_from_dtype(self, data):
# base test uses string representation of dtype
pass


class TestReshaping(base.BaseReshapingTests):
Expand Down