Current File : //usr/local/lib64/python3.6/site-packages/pandas/tests/io/formats/test_info.py |
from io import StringIO
import re
from string import ascii_uppercase as uppercase
import sys
import textwrap
import numpy as np
import pytest
from pandas.compat import IS64, PYPY
from pandas import (
CategoricalIndex,
DataFrame,
MultiIndex,
Series,
date_range,
option_context,
reset_option,
set_option,
)
import pandas._testing as tm
@pytest.fixture
def datetime_frame():
"""
Fixture for DataFrame of floats with DatetimeIndex
Columns are ['A', 'B', 'C', 'D']
A B C D
2000-01-03 -1.122153 0.468535 0.122226 1.693711
2000-01-04 0.189378 0.486100 0.007864 -1.216052
2000-01-05 0.041401 -0.835752 -0.035279 -0.414357
2000-01-06 0.430050 0.894352 0.090719 0.036939
2000-01-07 -0.620982 -0.668211 -0.706153 1.466335
2000-01-10 -0.752633 0.328434 -0.815325 0.699674
2000-01-11 -2.236969 0.615737 -0.829076 -1.196106
... ... ... ... ...
2000-02-03 1.642618 -0.579288 0.046005 1.385249
2000-02-04 -0.544873 -1.160962 -0.284071 -1.418351
2000-02-07 -2.656149 -0.601387 1.410148 0.444150
2000-02-08 -1.201881 -1.289040 0.772992 -1.445300
2000-02-09 1.377373 0.398619 1.008453 -0.928207
2000-02-10 0.473194 -0.636677 0.984058 0.511519
2000-02-11 -0.965556 0.408313 -1.312844 -0.381948
[30 rows x 4 columns]
"""
return DataFrame(tm.getTimeSeriesData())
def test_info_categorical_column():
# make sure it works
n = 2500
df = DataFrame({"int64": np.random.randint(100, size=n)})
df["category"] = Series(
np.array(list("abcdefghij")).take(np.random.randint(0, 10, size=n))
).astype("category")
df.isna()
buf = StringIO()
df.info(buf=buf)
df2 = df[df["category"] == "d"]
buf = StringIO()
df2.info(buf=buf)
def test_info(float_frame, datetime_frame):
io = StringIO()
float_frame.info(buf=io)
datetime_frame.info(buf=io)
frame = DataFrame(np.random.randn(5, 3))
frame.info()
frame.info(verbose=False)
def test_info_verbose():
buf = StringIO()
size = 1001
start = 5
frame = DataFrame(np.random.randn(3, size))
frame.info(verbose=True, buf=buf)
res = buf.getvalue()
header = " # Column Dtype \n--- ------ ----- "
assert header in res
frame.info(verbose=True, buf=buf)
buf.seek(0)
lines = buf.readlines()
assert len(lines) > 0
for i, line in enumerate(lines):
if i >= start and i < start + size:
line_nr = f" {i - start} "
assert line.startswith(line_nr)
def test_info_memory():
# https://github.com/pandas-dev/pandas/issues/21056
df = DataFrame({"a": Series([1, 2], dtype="i8")})
buf = StringIO()
df.info(buf=buf)
result = buf.getvalue()
bytes = float(df.memory_usage().sum())
expected = textwrap.dedent(
f"""\
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 2 non-null int64
dtypes: int64(1)
memory usage: {bytes} bytes
"""
)
assert result == expected
def test_info_wide():
io = StringIO()
df = DataFrame(np.random.randn(5, 101))
df.info(buf=io)
io = StringIO()
df.info(buf=io, max_cols=101)
rs = io.getvalue()
assert len(rs.splitlines()) > 100
xp = rs
set_option("display.max_info_columns", 101)
io = StringIO()
df.info(buf=io)
assert rs == xp
reset_option("display.max_info_columns")
def test_info_duplicate_columns():
io = StringIO()
# it works!
frame = DataFrame(np.random.randn(1500, 4), columns=["a", "a", "b", "b"])
frame.info(buf=io)
def test_info_duplicate_columns_shows_correct_dtypes():
# GH11761
io = StringIO()
frame = DataFrame([[1, 2.0]], columns=["a", "a"])
frame.info(buf=io)
io.seek(0)
lines = io.readlines()
assert " 0 a 1 non-null int64 \n" == lines[5]
assert " 1 a 1 non-null float64\n" == lines[6]
def test_info_shows_column_dtypes():
dtypes = [
"int64",
"float64",
"datetime64[ns]",
"timedelta64[ns]",
"complex128",
"object",
"bool",
]
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
df.info(buf=buf)
res = buf.getvalue()
header = (
" # Column Non-Null Count Dtype \n"
"--- ------ -------------- ----- "
)
assert header in res
for i, dtype in enumerate(dtypes):
name = f" {i:d} {i:d} {n:d} non-null {dtype}"
assert name in res
def test_info_max_cols():
df = DataFrame(np.random.randn(10, 5))
for len_, verbose in [(5, None), (5, False), (12, True)]:
# For verbose always ^ setting ^ summarize ^ full output
with option_context("max_info_columns", 4):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
for len_, verbose in [(12, None), (5, False), (12, True)]:
# max_cols not exceeded
with option_context("max_info_columns", 5):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
for len_, max_cols in [(12, 5), (5, 4)]:
# setting truncates
with option_context("max_info_columns", 4):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
# setting wouldn't truncate
with option_context("max_info_columns", 5):
buf = StringIO()
df.info(buf=buf, max_cols=max_cols)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_
def test_info_memory_usage():
# Ensure memory usage is displayed, when asserted, on the last line
dtypes = [
"int64",
"float64",
"datetime64[ns]",
"timedelta64[ns]",
"complex128",
"object",
"bool",
]
data = {}
n = 10
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
buf = StringIO()
# display memory usage case
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert "memory usage: " in res[-1]
# do not display memory usage case
df.info(buf=buf, memory_usage=False)
res = buf.getvalue().splitlines()
assert "memory usage: " not in res[-1]
df.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# memory usage is a lower bound, so print it as XYZ+ MB
assert re.match(r"memory usage: [^+]+\+", res[-1])
df.iloc[:, :5].info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
# excluded column with object dtype, so estimate is accurate
assert not re.match(r"memory usage: [^+]+\+", res[-1])
# Test a DataFrame with duplicate columns
dtypes = ["int64", "int64", "int64", "float64"]
data = {}
n = 100
for i, dtype in enumerate(dtypes):
data[i] = np.random.randint(2, size=n).astype(dtype)
df = DataFrame(data)
df.columns = dtypes
df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
df_with_object_index.info(buf=buf, memory_usage=True)
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+\+", res[-1])
df_with_object_index.info(buf=buf, memory_usage="deep")
res = buf.getvalue().splitlines()
assert re.match(r"memory usage: [^+]+$", res[-1])
# Ensure df size is as expected
# (cols * rows * bytes) + index size
df_size = df.memory_usage().sum()
exp_size = len(dtypes) * n * 8 + df.index.nbytes
assert df_size == exp_size
# Ensure number of cols in memory_usage is the same as df
size_df = np.size(df.columns.values) + 1 # index=True; default
assert size_df == np.size(df.memory_usage())
# assert deep works only on object
assert df.memory_usage().sum() == df.memory_usage(deep=True).sum()
# test for validity
DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
DataFrame(1, index=["a"], columns=["A"]).index.nbytes
df = DataFrame(
data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
)
df.index.nbytes
df.memory_usage(index=True)
df.index.values.nbytes
mem = df.memory_usage(deep=True).sum()
assert mem > 0
@pytest.mark.skipif(PYPY, reason="on PyPy deep=True doesn't change result")
def test_info_memory_usage_deep_not_pypy():
df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
> df_with_object_index.memory_usage(index=True).sum()
)
df_object = DataFrame({"a": ["a"]})
assert df_object.memory_usage(deep=True).sum() > df_object.memory_usage().sum()
@pytest.mark.skipif(not PYPY, reason="on PyPy deep=True does not change result")
def test_info_memory_usage_deep_pypy():
df_with_object_index = DataFrame({"a": [1]}, index=["foo"])
assert (
df_with_object_index.memory_usage(index=True, deep=True).sum()
== df_with_object_index.memory_usage(index=True).sum()
)
df_object = DataFrame({"a": ["a"]})
assert df_object.memory_usage(deep=True).sum() == df_object.memory_usage().sum()
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
df = DataFrame(
data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = mem - sys.getsizeof(df)
assert abs(diff) < 100
def test_info_memory_usage_qualified():
buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=[1, 2, 3])
df.info(buf=buf)
assert "+" not in buf.getvalue()
buf = StringIO()
df = DataFrame(1, columns=list("ab"), index=list("ABC"))
df.info(buf=buf)
assert "+" in buf.getvalue()
buf = StringIO()
df = DataFrame(
1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]),
)
df.info(buf=buf)
assert "+" not in buf.getvalue()
buf = StringIO()
df = DataFrame(
1,
columns=list("ab"),
index=MultiIndex.from_product([range(3), ["foo", "bar"]]),
)
df.info(buf=buf)
assert "+" in buf.getvalue()
def test_info_memory_usage_bug_on_multiindex():
# GH 14308
# memory usage introspection should not materialize .values
def memory_usage(f):
return f.memory_usage(deep=True).sum()
N = 100
M = len(uppercase)
index = MultiIndex.from_product(
[list(uppercase), date_range("20160101", periods=N)], names=["id", "date"],
)
df = DataFrame({"value": np.random.randn(N * M)}, index=index)
unstacked = df.unstack("id")
assert df.values.nbytes == unstacked.values.nbytes
assert memory_usage(df) > memory_usage(unstacked)
# high upper bound
assert memory_usage(unstacked) - memory_usage(df) < 2000
def test_info_categorical():
# GH14298
idx = CategoricalIndex(["a", "b"])
df = DataFrame(np.zeros((2, 2)), index=idx, columns=idx)
buf = StringIO()
df.info(buf=buf)
@pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system")
def test_info_int_columns():
# GH#37245
df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"])
buf = StringIO()
df.info(null_counts=True, buf=buf)
result = buf.getvalue()
expected = textwrap.dedent(
"""\
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, A to B
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 1 2 non-null int64
1 2 2 non-null int64
dtypes: int64(2)
memory usage: 48.0+ bytes
"""
)
assert result == expected