1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
# -*- coding:utf-8 -*-
import io
import json

import numpy as np
import pandas as pd
from unidecode import unidecode
# Input dump holding one JSON object per line. The space before ".txt" is
# part of the file name used by the original script and is kept as-is.
INPUT_PATH = "2017-06-21-tripadvisor_restaurant_detail .txt"

# Workbook written by main().
OUTPUT_PATH = 'tripadvisor_data_test2.xlsx'

# List-valued fields that are collapsed into one ';'-separated string.
LIST_FIELDS = (
    'breadcrumbs',
    'languages',
    'season_peopele',
    'traveller_type',
    'restaurant_imgs',
    'buiness_hours',
)

# Columns removed from the frame before it is exported.
DROPPED_COLUMNS = [
    'rate_percent',
    'buiness_hours',
    'breadcrumbs',
    'languages',
    'season_peopele',
    'traveller_type',
    'restaurant_imgs',
]


def flatten_record(record):
    """Return a flattened copy of one parsed restaurant record.

    The copy gains a ``country`` key (``breadcrumbs[1]``) and a ``city`` key
    (``breadcrumbs[2]``), every field named in ``LIST_FIELDS`` is joined into
    a single ';'-separated string, and ``rate_percent`` is serialised back to
    a JSON string.  The input mapping is left unmodified.

    Raises:
        KeyError: if one of the expected fields is missing.
        IndexError: if ``breadcrumbs`` has fewer than three entries.
    """
    flat = dict(record)
    crumbs = record['breadcrumbs']
    flat['country'] = crumbs[1]
    flat['city'] = crumbs[2]
    for field in LIST_FIELDS:
        flat[field] = ';'.join(record[field])
    flat['rate_percent'] = json.dumps(record['rate_percent'])
    return flat


def load_records(path):
    """Read the JSON-lines file at *path* and return the flattened records.

    The file is decoded as UTF-8 when it is opened and is closed again as
    soon as reading finishes.  Whitespace-only lines are skipped so that a
    stray blank line does not abort the whole import.
    """
    records = []
    with io.open(path, "r", encoding="utf-8") as source:
        for line in source:
            if not line.strip():
                continue
            records.append(flatten_record(json.loads(line)))
    return records


def build_frame(records, start=30000, stop=40000):
    """Build the export frame from *records*.

    The columns listed in ``DROPPED_COLUMNS`` are removed and the rows at
    positions ``start`` (inclusive) to ``stop`` (exclusive) are returned.
    The defaults reproduce the 30000:40000 window of the original script.
    """
    frame = pd.DataFrame(records)
    # errors='ignore' only matters for an empty input: every record produced
    # by flatten_record() carries all of the dropped columns.
    frame = frame.drop(DROPPED_COLUMNS, axis=1, errors='ignore')
    return frame.iloc[start:stop]


def main():
    """Convert the restaurant dump into an Excel workbook."""
    frame = build_frame(load_records(INPUT_PATH))
    # NOTE(review): the original script carried a disabled sanitising step,
    # `df.applymap(illegal_char_remover)`; that helper is not defined in the
    # part of the file visible here, so it is not reinstated.
    frame.to_excel(OUTPUT_PATH, sheet_name="Sheet1", engine="openpyxl")
    print("Ok!!! the file in %s" % OUTPUT_PATH)


if __name__ == "__main__":
    main()
|