#!/usr/bin/env python
# coding: utf-8
# do not import unicode_literals here to test ASCII in Python 2.7
import unittest
import pandas as pd
import pandas.util.testing as tm
[docs]class TestStrings(unittest.TestCase):
[docs] def setUp(self):
self.zhiragana_s = pd.Series([u'ぁあぃいぅうぇえぉお',
u'かがきぎくぐけげこご',
u'さざしじすずせぜそぞ',
u'ただちぢっつづてでとど',
u'なにぬねの',
u'はばぱひびぴふぶぷへべぺほぼぽ',
u'まみむめもゃやゅゆょよ',
u'らりるれろわをんゎゐゑゕゖゔ'])
self.zkatakana_s = pd.Series([u'ァアィイゥウェエォオ',
u'カガキギクグケゲコゴ',
u'サザシジスズセゼソゾ',
u'タダチヂッツヅテデトド',
u'ナニヌネノ',
u'ハバパヒビピフブプヘベペホボポ',
u'マミムメモャヤュユョヨ',
u'ラリルレロワヲンヮヰヱヵヶヴ',
u'ー・「」。、'])
self.hkatakana_s = pd.Series([u'ァアィイゥウェエォオ',
u'カガキギクグケゲコゴ',
u'サザシジスズセゼソゾ',
u'タダチヂッツヅテデトド',
u'ナニヌネノ',
u'ハバパヒビピフブプヘベペホボポ',
u'マミムメモャヤュユョヨ',
u'ラリルレロワヲンヮヰヱヵヶヴ',
u'ー・「」。、'])
self.zalpha_s = pd.Series([u'ABCDEFGH',
u'IJKLMNOP',
u'QRSTUVWXYZ',
u'abcdefgh',
u'ijklmnop',
u'qrstuvwxyz'])
self.halpha_s = pd.Series(['ABCDEFGH',
'IJKLMNOP',
'QRSTUVWXYZ',
'abcdefgh',
'ijklmnop',
'qrstuvwxyz'])
self.zdigit_s = pd.Series([u'01234', u'56789'])
self.hdigit_s = pd.Series(['01234', '56789'])
self.zsymbol_s = pd.Series([u'!"#$%&',
u''()*+,',
u'-./:;<',
u'=>?@[\',
u']^_`~{',
u'|} '])
self.hsymbol_s = pd.Series([u'!"#$%&',
u"'()*+,",
u'-./:;<',
u'=>?@[\\',
u']^_`~{',
u'|} '])
[docs] def test_mapper(self):
import japandas.core.strings as s
self.assertEqual(len(s._KANA_MAPPER), len(s._HKANA))
self.assertEqual(len(s._ALPHA_MAPPER), len(s._ZALPHA))
self.assertEqual(len(s._DIGIT_MAPPER), len(s._ZDIGIT))
self.assertEqual(len(s._SYMBOL_MAPPER), len(s._ZSYMBOL))
self.assertEqual(len(s._reverse_dict(s._KANA_MAPPER)), len(s._HKANA))
self.assertEqual(len(s._reverse_dict(s._ALPHA_MAPPER)), len(s._ZALPHA))
self.assertEqual(len(s._reverse_dict(s._DIGIT_MAPPER)), len(s._ZDIGIT))
self.assertEqual(len(s._Z2H_SYMBOL), len(s._H2Z_SYMBOL))
self.assertEqual(len(s._Z2H_KANA), len(s._H2Z_KANA))
self.assertEqual(len(s._Z2H_ALPHA), len(s._H2Z_ALPHA))
self.assertEqual(len(s._Z2H_DIGIT), len(s._H2Z_DIGIT))
self.assertEqual(len(s._reverse_dict(s._SYMBOL_MAPPER)), len(s._ZSYMBOL))
[docs] def test_z2h(self):
s = pd.Series([u'aaa', 'bbb', u'アアア', u'1', u'*'])
result = s.str.z2h()
expected = pd.Series(['aaa', 'bbb', u'アアア', '1', '*'])
tm.assert_series_equal(result, expected)
# full-width kana to half-width kana
result = self.zkatakana_s.str.z2h(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.hkatakana_s)
result = self.zkatakana_s.str.z2h(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.zkatakana_s)
result = self.zkatakana_s.str.z2h(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.zkatakana_s)
result = self.zkatakana_s.str.z2h(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.zkatakana_s)
# full-width kana to half-width alpha
result = self.zalpha_s.str.z2h(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.zalpha_s)
result = self.zalpha_s.str.z2h(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.halpha_s)
result = self.zalpha_s.str.z2h(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.zalpha_s)
result = self.zalpha_s.str.z2h(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.zalpha_s)
# full-width kana to half-width digit
result = self.zdigit_s.str.z2h(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.zdigit_s)
result = self.zdigit_s.str.z2h(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.zdigit_s)
result = self.zdigit_s.str.z2h(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.hdigit_s)
result = self.zdigit_s.str.z2h(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.zdigit_s)
# full-width kana to half-width symbol
result = self.zsymbol_s.str.z2h(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.zsymbol_s)
result = self.zsymbol_s.str.z2h(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.zsymbol_s)
result = self.zsymbol_s.str.z2h(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.zsymbol_s)
result = self.zsymbol_s.str.z2h(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.hsymbol_s)
# half-width to half-width
result = self.hkatakana_s.str.z2h()
tm.assert_series_equal(result, self.hkatakana_s)
result = self.halpha_s.str.z2h()
tm.assert_series_equal(result, self.halpha_s)
result = self.hdigit_s.str.z2h()
tm.assert_series_equal(result, self.hdigit_s)
result = self.hsymbol_s.str.z2h()
tm.assert_series_equal(result, self.hsymbol_s)
[docs] def test_h2z(self):
s = pd.Series(['aaa', 'bbb', u'アアア', u'1', '*'])
result = s.str.h2z()
expected = pd.Series([u'aaa', u'bbb', u'アアア', u'1', u'*'])
tm.assert_series_equal(result, expected)
# half-width kana to full-width kana
result = self.hkatakana_s.str.h2z(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.zkatakana_s)
result = self.hkatakana_s.str.h2z(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.hkatakana_s)
result = self.hkatakana_s.str.h2z(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.hkatakana_s)
result = self.hkatakana_s.str.h2z(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.hkatakana_s)
# half-width kana to full-width alpha
result = self.halpha_s.str.h2z(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.halpha_s)
result = self.halpha_s.str.h2z(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.zalpha_s)
result = self.halpha_s.str.h2z(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.halpha_s)
result = self.halpha_s.str.h2z(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.halpha_s)
# half-width kana to full-width digit
result = self.hdigit_s.str.h2z(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.hdigit_s)
result = self.hdigit_s.str.h2z(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.hdigit_s)
result = self.hdigit_s.str.h2z(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.zdigit_s)
result = self.hdigit_s.str.h2z(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.hdigit_s)
# half-width kana to full-width symbol
result = self.hsymbol_s.str.h2z(kana=True, alpha=False, digit=False, symbol=False)
tm.assert_series_equal(result, self.hsymbol_s)
result = self.hsymbol_s.str.h2z(kana=False, alpha=True, digit=False, symbol=False)
tm.assert_series_equal(result, self.hsymbol_s)
result = self.hsymbol_s.str.h2z(kana=False, alpha=False, digit=True, symbol=False)
tm.assert_series_equal(result, self.hsymbol_s)
result = self.hsymbol_s.str.h2z(kana=False, alpha=False, digit=False, symbol=True)
tm.assert_series_equal(result, self.zsymbol_s)
# full-width to full-width
result = self.zkatakana_s.str.h2z()
tm.assert_series_equal(result, self.zkatakana_s)
result = self.zalpha_s.str.h2z()
tm.assert_series_equal(result, self.zalpha_s)
result = self.zdigit_s.str.h2z()
tm.assert_series_equal(result, self.zdigit_s)
result = self.zkatakana_s.str.h2z()
tm.assert_series_equal(result, self.zkatakana_s)
[docs] def test_z2h_obj(self):
s = pd.Series(['aaa', None, u'アアア', u'あああ', u'1', 3])
result = s.str.z2h()
expected = pd.Series(['aaa', None, u'アアア', u'あああ', '1', None])
tm.assert_series_equal(result, expected)
empty_str = pd.Series(dtype=str)
tm.assert_series_equal(empty_str.str.h2z(), empty_str)
[docs] def test_h2z_obj(self):
s = pd.Series(['aaa', None, u'アアア', u'あああ', u'1', 3])
result = s.str.h2z()
expected = pd.Series([u'aaa', None, u'アアア', u'あああ', u'1', None])
tm.assert_series_equal(result, expected)
empty_str = pd.Series(dtype=str)
tm.assert_series_equal(empty_str.str.h2z(), empty_str)
[docs] def test_normalize(self):
s = pd.Series([u'aaa', 'bbb', u'アアア', u'1', u'*'])
result = s.str.normalize('NFKC')
expected = pd.Series(['aaa', 'bbb', u'アアア', '1', '*'])
tm.assert_series_equal(result, expected)
s = pd.Series([u'aaa', None, 'bbb', u'アアア', u'1', 5, u'*'])
result = s.str.normalize('NFKC')
expected = pd.Series(['aaa', None, 'bbb', u'アアア', '1', None, '*'])
tm.assert_series_equal(result, expected)
empty_str = pd.Series(dtype=str)
tm.assert_series_equal(empty_str.str.normalize('NFKC'), empty_str)
if __name__ == '__main__':
    # Run this module's tests with nose when the file is executed directly:
    # very verbose, output capture off (-s), stop at the first problem (-x),
    # and drop into pdb on errors (--pdb) and failures (--pdb-failure).
    # exit=False keeps nose from calling sys.exit() afterwards.
    import nose
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)