Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
4cb60e6
Implement BaseDtypeTests for ArrowStringDtype
xhochy Jul 10, 2020
d242f2d
Refactor to use parametrized StringDtype
TomAugspurger Sep 3, 2020
d39ab2c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Feb 18, 2021
2367810
abs-imports
simonjayhawkins Feb 18, 2021
9166d3b
post merge fixup
simonjayhawkins Feb 19, 2021
8760705
StringDtype[python] -> string[python]
simonjayhawkins Feb 19, 2021
d5b3fec
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 22, 2021
2c657df
pre-commit fix for inconsistent use of pandas namespace
simonjayhawkins Mar 22, 2021
647a6c2
fix typo
simonjayhawkins Mar 22, 2021
0596fd7
pre-commit fixup - undefined name 'ArrowStringDtype'
simonjayhawkins Mar 22, 2021
c5a19c5
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 26, 2021
99680c9
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 28, 2021
69a6cc1
"StringDtype[storage]" -> "string[storage]" misc
simonjayhawkins Mar 28, 2021
bd147ba
__from_arrow__
simonjayhawkins Mar 28, 2021
830275f
more testing (wip)
simonjayhawkins Mar 28, 2021
214e524
fix inference
simonjayhawkins Mar 28, 2021
c9ba03c
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Mar 29, 2021
7425536
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
68ac391
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 1, 2021
5cfa97a
post-merge fixup
simonjayhawkins Apr 1, 2021
74dbf96
remove changes to test_string_dtype - broken off in #40725
simonjayhawkins Apr 1, 2021
3985943
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
3bda421
post merge fix-up
simonjayhawkins Apr 15, 2021
0c108a4
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 15, 2021
523e24c
post merge fix-up
simonjayhawkins Apr 15, 2021
279624c
revert some changes made for pre-commit checks.
simonjayhawkins Apr 15, 2021
80d231e
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 16, 2021
c5ced5a
post merge fix-up
simonjayhawkins Apr 16, 2021
459812c
undo unrelated changes
simonjayhawkins Apr 16, 2021
d707b6b
undo changes to imports
simonjayhawkins Apr 16, 2021
71ccf24
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 17, 2021
daaac06
StringDtype.construct_array_type - add ref to issue
simonjayhawkins Apr 17, 2021
46626d1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Apr 19, 2021
3677bfa
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 1, 2021
42d382f
post merge fixup
simonjayhawkins May 1, 2021
4fb1a0d
add draft release note
simonjayhawkins May 1, 2021
5d4eac1
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 12, 2021
15efb2e
post merge fix-up
simonjayhawkins May 12, 2021
b53cfe0
docstrings
simonjayhawkins May 12, 2021
b7db53f
benchmarks
simonjayhawkins May 12, 2021
3399f08
pyarrow min
simonjayhawkins May 12, 2021
e365f01
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 26, 2021
71d1e6c
post merge fixup
simonjayhawkins May 26, 2021
9e23c35
misc clean
simonjayhawkins May 26, 2021
c69a611
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 27, 2021
64b3206
update construct_from_string docstring
simonjayhawkins May 27, 2021
d83a4ff
update whatsnew for dtype="string"
simonjayhawkins May 27, 2021
ef38660
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 30, 2021
aef1162
update release note
simonjayhawkins May 30, 2021
6247a5b
parametrize test for df.convert_dtypes()
simonjayhawkins May 30, 2021
a6d066c
fixup pd.array and more testing of string_storage option
simonjayhawkins May 31, 2021
8adb08d
use string_storage fixture more
simonjayhawkins May 31, 2021
3ad0638
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins May 31, 2021
56714c9
post merge fixup
simonjayhawkins May 31, 2021
6a1cc2b
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 2, 2021
1761a84
remove accessor methods section from release note
simonjayhawkins Jun 2, 2021
3e26baa
consistent dtype naming in benchmark
simonjayhawkins Jun 2, 2021
6b470b1
Apply suggestions from code review
simonjayhawkins Jun 2, 2021
2ec6de0
name and str() change to "string"
simonjayhawkins Jun 2, 2021
a0b7a70
remove testing of string dtype without storage specified.
simonjayhawkins Jun 2, 2021
d9dcd20
update StringDtype docstring
simonjayhawkins Jun 2, 2021
4a37470
add ArrowStringArray to pd.arrays namespace
simonjayhawkins Jun 2, 2021
1d59c7a
add common base class, BaseStringArray
simonjayhawkins Jun 2, 2021
e57c850
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 4, 2021
51f1b1d
fixup roundtrip tests
simonjayhawkins Jun 4, 2021
fc95c06
Merge remote-tracking branch 'upstream/master' into arrow-string-arra…
simonjayhawkins Jun 7, 2021
ef02a43
remove link
simonjayhawkins Jun 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fixup roundtrip tests
  • Loading branch information
simonjayhawkins committed Jun 4, 2021
commit 51f1b1d7ce878b40826cb96d7e661aae9ab2b726
4 changes: 4 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,10 @@ def string_storage(request):
return request.param


# Alias so we can test with cartesian product of string_storage
string_storage2 = string_storage


@pytest.fixture(params=tm.BYTES_DTYPES)
def bytes_dtype(request):
"""
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/arrays/string_/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,23 +431,25 @@ def test_arrow_array(dtype):


@td.skip_if_no("pyarrow")
def test_arrow_roundtrip(dtype):
def test_arrow_roundtrip(dtype, string_storage2):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The skips should be encompassed in the fixtures themselves, no? (This can be changed later.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is skipped if pyarrow is not installed, since the test needs pyarrow to create pa.table(df). The fixture additionally skips the ArrowStringArray cases if the installed version of pyarrow is < 1.0.0, so in that situation only the python StringDtype would be tested, with the python global setting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why aren't you just using string_storage (e.g. when you only need one)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

string_storage parametrizes dtype, so within a test string_storage is always the same as dtype. Using it alone, we can't test StringArray against the pyarrow global storage setting, or vice versa.

This way we have 4 tests (every combination of dtype storage and global storage setting), not 2.

# roundtrip possible from arrow 1.0.0
import pyarrow as pa

data = pd.array(["a", "b", None], dtype=dtype)
df = pd.DataFrame({"a": data})
table = pa.table(df)
assert table.field("a").type == "string"
result = table.to_pandas()
assert isinstance(result["a"].dtype, type(dtype))
tm.assert_frame_equal(result, df)
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
tm.assert_frame_equal(result, expected)
# ensure the missing value is represented by NA and not np.nan or None
assert result.loc[2, "a"] is pd.NA


@td.skip_if_no("pyarrow")
def test_arrow_load_from_zero_chunks(dtype):
def test_arrow_load_from_zero_chunks(dtype, string_storage2):
# GH-41040
import pyarrow as pa

Expand All @@ -457,9 +459,11 @@ def test_arrow_load_from_zero_chunks(dtype):
assert table.field("a").type == "string"
# Instantiate the same table with no chunks at all
table = pa.table([pa.chunked_array([], type=pa.string())], schema=table.schema)
result = table.to_pandas()
assert isinstance(result["a"].dtype, type(dtype))
tm.assert_frame_equal(result, df)
with pd.option_context("string_storage", string_storage2):
result = table.to_pandas()
assert isinstance(result["a"].dtype, pd.StringDtype)
expected = df.astype(f"string[{string_storage2}]")
tm.assert_frame_equal(result, expected)


def test_value_counts_na(dtype):
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -810,10 +810,11 @@ def test_additional_extension_arrays(self, pa):
check_round_trip(df, pa)

@td.skip_if_no("pyarrow", min_version="1.0.0")
def test_pyarrow_backed_string_array(self, pa):
def test_pyarrow_backed_string_array(self, pa, string_storage):
# test ArrowStringArray supported through the __arrow_array__ protocol
df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
check_round_trip(df, pa, expected=df)
with pd.option_context("string_storage", string_storage):
check_round_trip(df, pa, expected=df.astype(f"string[{string_storage}]"))

@td.skip_if_no("pyarrow")
def test_additional_extension_types(self, pa):
Expand Down