Make pd.array honor string_storage option and add more tests

pandas-dev · jorisvandenbossche · Jun 8, 2021 · Jul 10, 2020 · Sep 3, 2020 · Feb 18, 2021
commit a6d066ca43f44879f4a01c74c805b2bf4b0790b7
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1131,6 +1131,22 @@ def nullable_string_dtype(request):
     return request.param
 
 
+@pytest.fixture(
+    params=[
+        "python",
+        pytest.param("pyarrow", marks=td.skip_if_no("pyarrow", min_version="1.0.0")),
+    ]
+)
+def string_storage(request):
+    """
+    Parametrized fixture for pd.options.mode.string_storage.
+
+    * 'python'
+    * 'pyarrow' (skipped unless pyarrow >= 1.0.0 is installed)
+    """
+    return request.param
+
+
 @pytest.fixture(params=tm.BYTES_DTYPES)
 def bytes_dtype(request):
     """

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -295,7 +295,7 @@ def __init__(self, values, copy=False):
         super().__init__(values, copy=copy)
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
-        NDArrayBacked.__init__(self, self._ndarray, StringDtype())
+        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
         if not isinstance(values, type(self)):
             self._validate()
 
@@ -311,8 +311,9 @@ def _validate(self):
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
-        if dtype:
-            assert dtype == "string"
+        if dtype and not (isinstance(dtype, str) and dtype == "string"):
+            dtype = pandas_dtype(dtype)
+            assert isinstance(dtype, StringDtype) and dtype.storage == "python"
 
         from pandas.core.arrays.masked import BaseMaskedArray
 
@@ -332,7 +333,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
         # Manually creating new array avoids the validation step in the __init__, so is
         # faster. Refactor need for validation?
         new_string_array = cls.__new__(cls)
-        NDArrayBacked.__init__(new_string_array, result, StringDtype())
+        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
 
         return new_string_array
 
@@ -501,7 +502,7 @@ def _str_map(
         from pandas.arrays import BooleanArray
 
         if dtype is None:
-            dtype = StringDtype()
+            dtype = StringDtype(storage="python")
         if na_value is None:
             na_value = self.dtype.na_value
 

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -35,6 +35,7 @@
     is_object_dtype,
     is_scalar,
     is_string_dtype,
+    pandas_dtype,
 )
 from pandas.core.dtypes.missing import isna
 
@@ -154,6 +155,10 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
 
         cls._chk_pyarrow_available()
 
+        if dtype and not (isinstance(dtype, str) and dtype == "string"):
+            dtype = pandas_dtype(dtype)
+            assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
+
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype in ensure_string_array and
             # numerical issues with Float32Dtype

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -113,18 +113,22 @@ def array(
 
         Currently, pandas will infer an extension dtype for sequences of
 
-        ============================== =====================================
+        ============================== =======================================
         Scalar Type                    Array Type
-        ============================== =====================================
+        ============================== =======================================
         :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
         :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
         :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
         :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
         :class:`int`                   :class:`pandas.arrays.IntegerArray`
         :class:`float`                 :class:`pandas.arrays.FloatingArray`
-        :class:`str`                   :class:`pandas.arrays.StringArray`
+        :class:`str`                   :class:`pandas.arrays.StringArray` or
+                                       :class:`pandas.arrays.ArrowStringArray`
         :class:`bool`                  :class:`pandas.arrays.BooleanArray`
-        ============================== =====================================
+        ============================== =======================================
+
+        The ExtensionArray created when the scalar type is :class:`str` is determined by
+        ``pd.options.mode.string_storage`` if the dtype is not explicitly given.
 
         For all other cases, NumPy's usual inference rules will be used.
 
@@ -240,6 +244,14 @@ def array(
     ['a', <NA>, 'c']
     Length: 3, dtype: string[python]
 
+    >>> with pd.option_context("string_storage", "pyarrow"):
+    ...     arr = pd.array(["a", None, "c"])
+    ...
+    >>> arr
+    <ArrowStringArray>
+    ['a', <NA>, 'c']
+    Length: 3, dtype: string[pyarrow]
+
     >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")])
     <PeriodArray>
     ['2000-01-01', '2000-01-01']
@@ -292,10 +304,10 @@ def array(
         IntegerArray,
         IntervalArray,
         PandasArray,
-        StringArray,
         TimedeltaArray,
         period_array,
     )
+    from pandas.core.arrays.string_ import StringDtype
 
     if lib.is_scalar(data):
         msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
@@ -345,7 +357,8 @@ def array(
             return TimedeltaArray._from_sequence(data, copy=copy)
 
         elif inferred_dtype == "string":
-            return StringArray._from_sequence(data, copy=copy)
+            # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
+            return StringDtype().construct_array_type()._from_sequence(data, copy=copy)
 
         elif inferred_dtype == "integer":
             return IntegerArray._from_sequence(data, copy=copy)

diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py
@@ -8,27 +8,33 @@
 
 pa = pytest.importorskip("pyarrow", minversion="1.0.0")
 
+from pandas.core.arrays.string_ import (
+    StringArray,
+    StringDtype,
+)
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
 
 def test_eq_all_na():
-    a = pd.array([pd.NA, pd.NA], dtype=pd.StringDtype("pyarrow"))
+    a = pd.array([pd.NA, pd.NA], dtype=StringDtype("pyarrow"))
     result = a == a
     expected = pd.array([pd.NA, pd.NA], dtype="boolean")
     tm.assert_extension_array_equal(result, expected)
 
 
-def test_config():
-    # python by default
-    assert pd.StringDtype().storage == "python"
-    arr = pd.array(["a", "b"])
-    assert arr.dtype.storage == "python"
+def test_config(string_storage):
+    with pd.option_context("string_storage", string_storage):
+        assert StringDtype().storage == string_storage
+        result = pd.array(["a", "b"])
+        assert result.dtype.storage == string_storage
 
-    with pd.option_context("mode.string_storage", "pyarrow"):
-        assert pd.StringDtype().storage == "pyarrow"
-        arr = pd.array(["a", "b"])
-        assert arr.dtype.storage == "pyarrow"
+    expected = (
+        StringDtype(string_storage).construct_array_type()._from_sequence(["a", "b"])
+    )
+    tm.assert_equal(result, expected)
 
+
+def test_config_bad_storage_raises():
     msg = re.escape("Value must be one of python|pyarrow")
     with pytest.raises(ValueError, match=msg):
         pd.options.mode.string_storage = "foo"
@@ -50,3 +56,51 @@ def test_constructor_not_string_type_raises(array, chunked):
         )
     with pytest.raises(ValueError, match=msg):
         ArrowStringArray(arr)
+
+
+def test_from_sequence_wrong_dtype_raises():
+    with pd.option_context("string_storage", "python"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pd.option_context("string_storage", "pyarrow"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pytest.raises(AssertionError, match=None):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[python]")
+
+    ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
+
+    with pytest.raises(AssertionError, match=None):
+        with pd.option_context("string_storage", "python"):
+            ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pd.option_context("string_storage", "pyarrow"):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pytest.raises(AssertionError, match=None):
+        ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
+
+    ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
+
+    with pd.option_context("string_storage", "python"):
+        StringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    with pd.option_context("string_storage", "pyarrow"):
+        StringArray._from_sequence(["a", None, "c"], dtype="string")
+
+    StringArray._from_sequence(["a", None, "c"], dtype="string[python]")
+
+    with pytest.raises(AssertionError, match=None):
+        StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]")
+
+    with pd.option_context("string_storage", "python"):
+        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    with pytest.raises(AssertionError, match=None):
+        with pd.option_context("string_storage", "pyarrow"):
+            StringArray._from_sequence(["a", None, "c"], dtype=StringDtype())
+
+    StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python"))
+
+    with pytest.raises(AssertionError, match=None):
+        StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow"))
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
@@ -18,7 +18,6 @@
     IntegerArray,
     IntervalArray,
     SparseArray,
-    StringArray,
     TimedeltaArray,
 )
 from pandas.core.arrays import (
@@ -132,8 +131,16 @@
         ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
         (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
         # String
-        (["a", None], "string", StringArray._from_sequence(["a", None])),
-        (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])),
+        (
+            ["a", None],
+            "string",
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype(),
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
         # Boolean
         ([True, None], "boolean", BooleanArray._from_sequence([True, None])),
         ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
@@ -253,8 +260,14 @@ def test_array_copy():
         ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0])),
         ([1, np.nan, 2.0], FloatingArray._from_sequence([1.0, None, 2.0])),
         # string
-        (["a", "b"], StringArray._from_sequence(["a", "b"])),
-        (["a", None], StringArray._from_sequence(["a", None])),
+        (
+            ["a", "b"],
+            pd.StringDtype().construct_array_type()._from_sequence(["a", "b"]),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype().construct_array_type()._from_sequence(["a", None]),
+        ),
         # Boolean
         ([True, False], BooleanArray._from_sequence([True, False])),
         ([True, None], BooleanArray._from_sequence([True, None])),

diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -298,7 +298,7 @@ def test_searchsorted(self):
             assert result == 10
 
     @pytest.mark.parametrize("box", [None, "index", "series"])
-    def test_searchsorted_castable_strings(self, arr1d, box, request):
+    def test_searchsorted_castable_strings(self, arr1d, box, request, string_storage):
         if isinstance(arr1d, DatetimeArray):
             tz = arr1d.tz
             ts1, ts2 = arr1d[1:3]
@@ -341,14 +341,17 @@ def test_searchsorted_castable_strings(self, arr1d, box, request):
         ):
             arr.searchsorted("foo")
 
-        with pytest.raises(
-            TypeError,
-            match=re.escape(
-                f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
-                "or array of those. Got 'StringArray' instead."
-            ),
-        ):
-            arr.searchsorted([str(arr[1]), "baz"])
+        arr_type = "StringArray" if string_storage == "python" else "ArrowStringArray"
+
+        with pd.option_context("string_storage", string_storage):
+            with pytest.raises(
+                TypeError,
+                match=re.escape(
+                    f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
+                    f"or array of those. Got '{arr_type}' instead."
+                ),
+            ):
+                arr.searchsorted([str(arr[1]), "baz"])
 
     def test_getitem_near_implementation_bounds(self):
         # We only check tz-naive for DTA bc the bounds are slightly different

diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
@@ -12,6 +12,7 @@
 from pandas._libs.tslibs import iNaT
 import pandas.util._test_decorators as td
 
+import pandas as pd
 from pandas import (
     NA,
     Categorical,
@@ -377,17 +378,34 @@ class TestAstypeString:
             # currently no way to parse IntervalArray from a list of strings
         ],
     )
-    def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request):
+    def test_astype_string_to_extension_dtype_roundtrip(
+        self, data, dtype, request, string_storage
+    ):
         if dtype == "boolean" or (
             dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data
         ):
             mark = pytest.mark.xfail(
                 reason="TODO StringArray.astype() with missing values #GH40566"
             )
             request.node.add_marker(mark)
+
+        if string_storage == "pyarrow" and dtype in (
+            "category",
+            "datetime64[ns]",
+            "datetime64[ns, US/Eastern]",
+            "UInt16",
+            "period[M]",
+        ):
+            mark = pytest.mark.xfail(
+                reason="TypeError: Cannot interpret ... as a data type"
+            )
+            request.node.add_marker(mark)
+
         # GH-40351
         s = Series(data, dtype=dtype)
-        tm.assert_series_equal(s, s.astype("string").astype(dtype))
+        with pd.option_context("string_storage", string_storage):
+            result = s.astype("string").astype(dtype)
+        tm.assert_series_equal(result, s)
 
 
 class TestAstypeCategorical:

diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
@@ -6,6 +6,7 @@
     MultiIndex,
     Series,
     _testing as tm,
+    get_option,
 )
 from pandas.core import strings as strings
 
@@ -128,7 +129,9 @@ def test_api_per_method(
 def test_api_for_categorical(any_string_method, any_string_dtype, request):
     # http://github.com/pandas-dev/pandas/issues/10661
 
-    if any_string_dtype == "string[pyarrow]":
+    if any_string_dtype == "string[pyarrow]" or (
+        any_string_dtype == "string" and get_option("string_storage") == "pyarrow"
+    ):
         # unsupported operand type(s) for +: 'ArrowStringArray' and 'str'
         mark = pytest.mark.xfail(raises=TypeError, reason="Not Implemented")
         request.node.add_marker(mark)