Python 处理 JSON 文件(Yelp 数据集)

用 Python 脚本处理 Yelp 数据集

import sys
import json
import re
import os
import time

# Number of (train, test) reviews taken from the head of review.json for
# each named dataset size.
DATASET_SIZES = {
    'sample': (40, 10),
    'small': (1280, 320),
    'medium': (32000, 8000),
    'large': (2000000, 400000),
}


def bucket_stars(num_stars):
    """Collapse a star rating into one of three classes: 1, 3 or 5.

    Ratings above 4.0 map to 5, ratings above 2.0 (up to and including 4.0)
    map to 3, and everything else maps to 1.
    """
    if num_stars > 4.0:
        return 5
    if num_stars > 2.0:
        return 3
    return 1


def clean_review_text(text):
    """Return *text* lower-cased, with punctuation and line breaks removed.

    Every character that is neither a word character nor whitespace is
    dropped, then newlines and carriage returns are removed so the review
    fits on a single output line.
    """
    stripped = re.sub(r'[^\w\s]', '', text)
    # NOTE(review): line breaks are deleted rather than replaced by a space,
    # so words on either side of a break are fused together. Kept as-is to
    # leave the output format unchanged - confirm whether that is intended.
    return stripped.replace("\n", "").replace("\r", "").lower()


def format_record(review):
    """Build the output line '<review_id> <class> <cleaned text>\\n'.

    *review* is a decoded JSON object providing the keys 'review_id',
    'stars' and 'text'.
    """
    return '%s %s %s\n' % (
        review['review_id'],
        bucket_stars(review['stars']),
        clean_review_text(review['text']),
    )


def format_progress(count, total, bar_len=30):
    """Return a one-line progress bar for *count* of *total* reviews.

    The string starts with a carriage return so that successive writes
    overwrite each other on the terminal. *total* must be non-zero.
    """
    fraction = count / float(total)
    filled_len = int(round(bar_len * fraction))
    percents = round(100.0 * fraction, 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)
    return "\r[%s] %s%s ... %d/%d reviews created" % (
        bar, percents, '%', count, total)


def main(argv):
    """Create datasets/<name>/train.txt and test.txt from the Yelp reviews.

    *argv* is the command line: argv[1] selects an entry of DATASET_SIZES.
    The first ``num_train + num_test`` lines of yelp_dataset/review.json are
    read; the first ``num_train`` reviews go to train.txt and the remainder
    to test.txt. Exits with a usage message when the dataset name is missing
    or unknown.
    """
    if len(argv) < 2 or argv[1] not in DATASET_SIZES:
        program = argv[0] if argv else 'script'
        sys.exit('usage: %s {%s}' % (program, '|'.join(DATASET_SIZES)))

    dataset_name = argv[1]
    num_train, num_test = DATASET_SIZES[dataset_name]
    limit = num_train + num_test

    # Read the head of the file directly instead of shelling out to
    # head/mv/rm: portable, no temp file, and the working directory is
    # never changed.
    dataset = []
    source_path = os.path.join('yelp_dataset', 'review.json')
    with open(source_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            if line_no >= limit:
                break
            line = line.strip()
            if line:  # skip blank lines instead of failing in json.loads
                dataset.append(line)

    print("Done loading in dataset")

    out_dir = os.path.join('datasets', dataset_name)
    os.makedirs(out_dir, exist_ok=True)

    total = len(dataset)
    train_path = os.path.join(out_dir, 'train.txt')
    test_path = os.path.join(out_dir, 'test.txt')
    with open(train_path, 'w', encoding='utf-8') as train, \
            open(test_path, 'w', encoding='utf-8') as test:
        for count, raw in enumerate(dataset, 1):
            record = format_record(json.loads(raw))
            # The first num_train reviews are training data, the rest test.
            target = train if count <= num_train else test
            target.write(record)

            sys.stdout.write(format_progress(count, total))
            sys.stdout.flush()

    sys.stdout.write("\n")


if __name__ == '__main__':
    main(sys.argv)

原文地址:https://www.cnblogs.com/Willendless/p/12066555.html