#!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
from pandas_datareader.base import _BaseReader
from japandas.tseries.tools import to_datetime
# http://www.e-stat.go.jp/api/e-stat-manual/
METADATA_MAPPER = {
# 'TABLE_INF': '統計表ID',
'STAT_NAME': '政府統計名',
'GOV_ORG': '作成機関名',
'STATISTICS_NAME': '提供統計名及び提供分類名',
'TITLE': '統計表題名及び表番号',
'CYCLE': '提供周期',
'SURVEY_DATE': '調査年月',
'OPEN_DATE': '公開日',
'SMALL_AREA': '小地域属性フラグ',
'MAIN_CATEGORY': '統計大分野名',
'SUB_CATEGORY': '統計小分野名',
'OVERALL_TOTAL_NUMBER': '総件数',
'UPDATED_DATE': '最終更新日',
'id': '統計表ID'
}
[docs]class EStatReader(_BaseReader):
def __init__(self, symbols=None, appid=None,
limit=None, startPosition=None, **kwargs):
if isinstance(symbols, pd.DataFrame):
if '統計表ID' in symbols.columns:
symbols = symbols.loc[:, '統計表ID']
else:
raise ValueError('DataFrame 中に "統計表ID" カラムがありません')
super(EStatReader, self).__init__(symbols=symbols, **kwargs)
if appid is None:
raise ValueError('アプリケーションID "appid" を文字列で指定してください')
self.appid = appid
# e-Stat attrs
self.limit = limit
self.startPosition = startPosition
@property
def url(self):
return 'http://api.e-stat.go.jp/rest/2.0/app/getStatsData'
@property
def params(self):
params = {'appId': self.appid, 'lang': 'J'}
for attr in ['limit', 'startPosition']:
value = getattr(self, attr, None)
if value is not None:
params[attr] = value
return params
[docs] def read(self):
""" read data """
if isinstance(self.symbols, pd.compat.string_types):
if len(self.symbols) == 8:
return self.get_estat_list()
params = self.params
params['statsDataId'] = self.symbols
return self._read_one_data(self.url, params)
elif pd.api.types.is_list_like(self.symbols):
dfs = []
for symbol in self.symbols:
params = self.params
params['statsDataId'] = symbol
df = self._read_one_data(self.url, params)
dfs.append(df)
if len(dfs) == 0:
raise ValueError('取得するIDがありません')
elif len(dfs) == 1:
return dfs[0]
else:
return dfs[0].append(dfs[1:])
else:
raise ValueError('IDは文字列もしくはそのリストで指定してください')
def _read_lines(self, out):
root = ET.fromstring(out.getvalue())
# retrieve class
class_names = {} # mapping from class id to name
class_codes = {} # mapping from class id to codes
for c in root.findall('.//CLASS_OBJ'):
class_id = c.attrib['id']
class_names[class_id] = c.attrib['name']
mapper = {}
for code in c.findall('CLASS'):
mapper[code.attrib['code']] = code.attrib['name']
class_codes[class_id] = mapper
# retrieve values
values = []
for value in root.findall('.//VALUE'):
row = {}
for cat in class_codes:
name = class_names[cat]
code = value.attrib[cat]
row[name] = class_codes[cat][code]
if value.text in ('-', ):
# avoid to_numeric fails
row['value'] = np.nan
else:
row['value'] = value.text
values.append(row)
df = pd.DataFrame(values)
df.loc[:, 'value'] = pd.to_numeric(df['value'], errors='ignore')
if 'time' in class_names:
df = df.set_index(class_names['time'])
df.index = to_datetime(df.index)
return df
[docs] def get_estat_list(self):
url = 'http://api.e-stat.go.jp/rest/2.0/app/getStatsList'
params = {'appId': self.appid, 'lang': 'J', 'statsCode': self.symbols}
out = self._read_url_as_StringIO(url, params=params)
root = ET.fromstring(out.getvalue())
values = []
columns = []
for table in root.findall('.//TABLE_INF'):
columns = ['統計表ID']
row = {'統計表ID': table.get('id')}
for elem in table.iter():
if elem.tag == 'TABLE_INF':
continue
if elem.tag in ('UPDATED_DATE', 'OPEN_DATE'):
val = pd.to_datetime(elem.text)
elif elem.tag == 'SURVEY_DATE':
# Almost impossible to parse SURVEY_DATE as Timestamp...
val = elem.text
elif elem.tag == 'OVERALL_TOTAL_NUMVER':
val = pd.to_numeric(elem.text)
else:
val = elem.text
label = METADATA_MAPPER.get(elem.tag, elem.tag)
columns.append(label)
row[label] = val
values.append(row)
if len(values) == 0:
try:
# if msg can be extracted from XML, raise it
root = ET.fromstring(out.getvalue())
msg = root.find('RESULT').find('ERROR_MSG').text
except Exception:
# otherwie, raise all XML content
raise ValueError(out.getvalue())
raise ValueError(msg.encode('utf-8', 'replace'))
df = pd.DataFrame(values, columns=columns)
return df