Source code for japandas.core.tests.test_strings

#!/usr/bin/env python
# coding: utf-8

# do not import unicode_literals here to test ASCII in Python 2.7
import unittest

import pandas as pd
import pandas.util.testing as tm


class TestStrings(unittest.TestCase):
    """Tests for the ``.str`` accessor methods ``z2h``, ``h2z``, ``normalize``.

    Fixture names follow the convention used throughout the tests:
    a ``z`` prefix holds full-width (zenkaku) text and an ``h`` prefix
    holds the half-width (hankaku) counterpart, element for element.
    ASCII-only literals are deliberately written without the ``u`` prefix
    so that byte strings are exercised on Python 2.7.
    """

    # Keyword flags accepted by ``z2h`` / ``h2z``, in the order the tests
    # exercise them.
    _FLAGS = ('kana', 'alpha', 'digit', 'symbol')

    def setUp(self):
        """Create the parallel full-width / half-width fixture Series."""
        self.zhiragana_s = pd.Series([
            u'ぁあぃいぅうぇえぉお', u'かがきぎくぐけげこご',
            u'さざしじすずせぜそぞ', u'ただちぢっつづてでとど',
            u'なにぬねの', u'はばぱひびぴふぶぷへべぺほぼぽ',
            u'まみむめもゃやゅゆょよ', u'らりるれろわをんゎゐゑゕゖゔ'])

        self.zkatakana_s = pd.Series([
            u'ァアィイゥウェエォオ', u'カガキギクグケゲコゴ',
            u'サザシジスズセゼソゾ', u'タダチヂッツヅテデトド',
            u'ナニヌネノ', u'ハバパヒビピフブプヘベペホボポ',
            u'マミムメモャヤュユョヨ', u'ラリルレロワヲンヮヰヱヵヶヴ',
            u'ー・「」。、'])
        # Half-width katakana; voiced kana are a base character followed by
        # a half-width (semi-)voiced sound mark.  The five small/archaic
        # kana in the 8th element have no half-width form and stay as-is.
        self.hkatakana_s = pd.Series([
            u'ｧｱｨｲｩｳｪｴｫｵ', u'ｶｶﾞｷｷﾞｸｸﾞｹｹﾞｺｺﾞ',
            u'ｻｻﾞｼｼﾞｽｽﾞｾｾﾞｿｿﾞ', u'ﾀﾀﾞﾁﾁﾞｯﾂﾂﾞﾃﾃﾞﾄﾄﾞ',
            u'ﾅﾆﾇﾈﾉ', u'ﾊﾊﾞﾊﾟﾋﾋﾞﾋﾟﾌﾌﾞﾌﾟﾍﾍﾞﾍﾟﾎﾎﾞﾎﾟ',
            u'ﾏﾐﾑﾒﾓｬﾔｭﾕｮﾖ', u'ﾗﾘﾙﾚﾛﾜｦﾝヮヰヱヵヶｳﾞ',
            u'ｰ･｢｣｡､'])

        self.zalpha_s = pd.Series([
            u'ＡＢＣＤＥＦＧＨ', u'ＩＪＫＬＭＮＯＰ', u'ＱＲＳＴＵＶＷＸＹＺ',
            u'ａｂｃｄｅｆｇｈ', u'ｉｊｋｌｍｎｏｐ', u'ｑｒｓｔｕｖｗｘｙｚ'])
        self.halpha_s = pd.Series([
            'ABCDEFGH', 'IJKLMNOP', 'QRSTUVWXYZ',
            'abcdefgh', 'ijklmnop', 'qrstuvwxyz'])

        self.zdigit_s = pd.Series([u'０１２３４', u'５６７８９'])
        self.hdigit_s = pd.Series(['01234', '56789'])

        # The last full-width element ends with U+3000 (ideographic space),
        # written as an escape so it is visible in the source.
        self.zsymbol_s = pd.Series([
            u'！＂＃＄％＆', u'＇（）＊＋，', u'－．／：；＜',
            u'＝＞？＠［＼', u'］＾＿｀～｛', u'｜｝\u3000'])
        self.hsymbol_s = pd.Series([
            u'!"#$%&', u"'()*+,", u'-./:;<',
            u'=>?@[\\', u']^_`~{', u'|} '])

    def _check_single_flag(self, method, source, target_flag, converted):
        """Call ``source.str.<method>`` once per flag with only it enabled.

        The result must equal ``converted`` when the enabled flag is
        ``target_flag`` and must equal ``source`` (unchanged) otherwise.
        """
        for enabled in self._FLAGS:
            kwargs = {name: name == enabled for name in self._FLAGS}
            result = getattr(source.str, method)(**kwargs)
            expected = converted if enabled == target_flag else source
            tm.assert_series_equal(result, expected)

    def test_mapper(self):
        """Mapping tables and their reverses must be the same size."""
        import japandas.core.strings as s

        self.assertEqual(len(s._KANA_MAPPER), len(s._HKANA))
        self.assertEqual(len(s._ALPHA_MAPPER), len(s._ZALPHA))
        self.assertEqual(len(s._DIGIT_MAPPER), len(s._ZDIGIT))
        self.assertEqual(len(s._SYMBOL_MAPPER), len(s._ZSYMBOL))

        self.assertEqual(len(s._reverse_dict(s._KANA_MAPPER)),
                         len(s._HKANA))
        self.assertEqual(len(s._reverse_dict(s._ALPHA_MAPPER)),
                         len(s._ZALPHA))
        self.assertEqual(len(s._reverse_dict(s._DIGIT_MAPPER)),
                         len(s._ZDIGIT))

        self.assertEqual(len(s._Z2H_SYMBOL), len(s._H2Z_SYMBOL))
        self.assertEqual(len(s._Z2H_KANA), len(s._H2Z_KANA))
        self.assertEqual(len(s._Z2H_ALPHA), len(s._H2Z_ALPHA))
        self.assertEqual(len(s._Z2H_DIGIT), len(s._H2Z_DIGIT))

        self.assertEqual(len(s._reverse_dict(s._SYMBOL_MAPPER)),
                         len(s._ZSYMBOL))

    def test_z2h(self):
        """Full-width to half-width conversion, overall and per flag."""
        s = pd.Series([u'ａａａ', 'bbb', u'アアア', u'１', u'＊'])
        result = s.str.z2h()
        expected = pd.Series(['aaa', 'bbb', u'ｱｱｱ', '1', '*'])
        tm.assert_series_equal(result, expected)

        # each full-width fixture is converted by its own flag only
        self._check_single_flag('z2h', self.zkatakana_s, 'kana',
                                self.hkatakana_s)
        self._check_single_flag('z2h', self.zalpha_s, 'alpha',
                                self.halpha_s)
        self._check_single_flag('z2h', self.zdigit_s, 'digit',
                                self.hdigit_s)
        self._check_single_flag('z2h', self.zsymbol_s, 'symbol',
                                self.hsymbol_s)

        # half-width to half-width: already-converted text is untouched
        for half in (self.hkatakana_s, self.halpha_s,
                     self.hdigit_s, self.hsymbol_s):
            tm.assert_series_equal(half.str.z2h(), half)

    def test_h2z(self):
        """Half-width to full-width conversion, overall and per flag."""
        s = pd.Series(['aaa', 'bbb', u'ｱｱｱ', u'１', '*'])
        result = s.str.h2z()
        expected = pd.Series([u'ａａａ', u'ｂｂｂ', u'アアア', u'１', u'＊'])
        tm.assert_series_equal(result, expected)

        # each half-width fixture is converted by its own flag only
        self._check_single_flag('h2z', self.hkatakana_s, 'kana',
                                self.zkatakana_s)
        self._check_single_flag('h2z', self.halpha_s, 'alpha',
                                self.zalpha_s)
        self._check_single_flag('h2z', self.hdigit_s, 'digit',
                                self.zdigit_s)
        self._check_single_flag('h2z', self.hsymbol_s, 'symbol',
                                self.zsymbol_s)

        # full-width to full-width: already-converted text is untouched
        # (the symbol fixture is included here; the kana fixture used to be
        # checked twice in its place)
        for full in (self.zkatakana_s, self.zalpha_s,
                     self.zdigit_s, self.zsymbol_s):
            tm.assert_series_equal(full.str.h2z(), full)

    def test_z2h_obj(self):
        """``z2h`` on mixed object data: non-strings become missing."""
        s = pd.Series(['aaa', None, u'アアア', u'あああ', u'１', 3])
        result = s.str.z2h()
        expected = pd.Series(['aaa', None, u'ｱｱｱ', u'あああ', '1', None])
        tm.assert_series_equal(result, expected)

        empty_str = pd.Series(dtype=str)
        tm.assert_series_equal(empty_str.str.z2h(), empty_str)

    def test_h2z_obj(self):
        """``h2z`` on mixed object data: non-strings become missing."""
        s = pd.Series(['aaa', None, u'ｱｱｱ', u'あああ', u'１', 3])
        result = s.str.h2z()
        expected = pd.Series([u'ａａａ', None, u'アアア', u'あああ', u'１',
                              None])
        tm.assert_series_equal(result, expected)

        empty_str = pd.Series(dtype=str)
        tm.assert_series_equal(empty_str.str.h2z(), empty_str)

    def test_normalize(self):
        """NFKC folds full-width ASCII and half-width kana to plain forms."""
        s = pd.Series([u'ａａａ', 'bbb', u'ｱｱｱ', u'１', u'＊'])
        result = s.str.normalize('NFKC')
        expected = pd.Series(['aaa', 'bbb', u'アアア', '1', '*'])
        tm.assert_series_equal(result, expected)

        s = pd.Series([u'ａａａ', None, 'bbb', u'ｱｱｱ', u'１', 5, u'＊'])
        result = s.str.normalize('NFKC')
        expected = pd.Series(['aaa', None, 'bbb', u'アアア', '1', None, '*'])
        tm.assert_series_equal(result, expected)

        empty_str = pd.Series(dtype=str)
        tm.assert_series_equal(empty_str.str.normalize('NFKC'), empty_str)

    def test_normalize_format(self):
        """Every normalization form must agree with ``unicodedata``."""
        import unicodedata

        values = [u'ｱｲｳｴｵ', u'ｶｷｸｹｺ', u'ｶﾞｷﾞｸﾞｹﾞｺﾞ', u'ＡＢＣＤＥ']
        for form in ['NFD', 'NFC', 'NFKD', 'NFKC']:
            result = pd.Series(values).str.normalize(form).tolist()
            expected = [unicodedata.normalize(form, v) for v in values]
            self.assertEqual(result, expected)
if __name__ == '__main__':
    import nose

    # Run this module under nose: verbose, no output capture, stop at the
    # first problem and open the debugger on errors and failures.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)