Обработка больших данных на Python

Тонкости настройки аналитики в Telegram-канале

изменения внес

Чтение csv целиком

import pandas as pd
# Load the entire semicolon-separated file into one DataFrame in a single call.
# The first line of the file (header=0) supplies the column names.
bigdata = pd.read_csv('/Users/mironovich/Downloads/bigdata.csv', sep=';', header=0, encoding='utf8')

Такой способ занимает слишком много оперативной памяти

Чтение csv порциями

Опция iterator функции pandas.read_csv: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

import sys
from IPython.display import clear_output

import os
import re

# Number of rows requested from the reader on each get_chunk() call.
chunkSize = 5*10**5
# Single source of truth for the input path: both the size lookup and the
# reader below use it, so they can never point at different files.
filepath = '/Users/mironovich/Downloads/bigdata.csv'

# Size of the input file in bytes.
fileSize = os.path.getsize(filepath)
# Per-row size estimate; 0 means "not measured yet".
rowSize = 0

# iterator=True makes read_csv return a reader object instead of a DataFrame,
# so rows can be pulled in portions with reader.get_chunk(n).
reader = pd.read_csv(
    filepath,
    sep=';',
    header=0,
    encoding='utf8',
    iterator=True,
)


# Merge the CSV against cashbackUsers chunk by chunk so the whole file never
# has to be held in memory at once.
chunksNum = 0        # chunks processed so far
stop = 0             # chunk limit; 0 never equals chunksNum after the increment, i.e. "no limit"
merged_parts = []    # per-chunk merge results, concatenated once after the loop
arpu = pd.DataFrame()

go = True
while go:
    try:
        chunk = reader.get_chunk(chunkSize)
        # NOTE(review): a left join keeps every cashbackUsers row for every
        # chunk, so users without a match in a chunk appear with NaNs once per
        # chunk - confirm this is intended.
        tmp = cashbackUsers.merge(chunk, how='left', left_on='ctn', right_on='user_id')
        # Collect the parts and concatenate once at the end: calling pd.concat
        # inside the loop re-copies all accumulated rows on every iteration.
        merged_parts.append(tmp)

        if rowSize == 0:
            # Rough per-row size derived from the rendered first row of the first chunk.
            rowSize = len(re.sub(r'\s+', ';', chunk.iloc[0].to_string()))
        chunksNum += 1
        if chunksNum == stop:
            go = False

    except StopIteration:
        # Normal termination: the reader has no more rows to return.
        go = False

    except Exception as e:
        # Best-effort: report the problem and keep whatever was merged so far.
        print(e)
        go = False

if merged_parts:
    arpu = pd.concat(merged_parts)

arpu.head()

Кажется, все повисло

Прогресс-бар

import sys
from IPython.display import clear_output

def update_progress(progress):
    """Redraw a text progress bar in the notebook cell output.

    Args:
        progress: Completed fraction of the work. An int is converted to
            float; any value that is neither int nor float is treated as 0.
            The value is clamped to the range [0, 1], so an over-estimated
            fraction can no longer overflow the bar or report more than 100%.
    """
    bar_length = 80
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0.0

    # Clamp on both sides: callers pass estimates that may overshoot 1.
    progress = min(max(progress, 0.0), 1.0)

    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True postpones clearing until the new output is ready to replace
    # the old one, which avoids flicker between redraws.
    clear_output(wait=True)
    print("Progress: [{0}] {1:.1f}%".format(bar, progress * 100))

import os
import re

# Input file and the number of rows requested per get_chunk() call.
filepath = '/Users/mironovich/Downloads/test_002.csv'
chunkSize = 5*10**5

# Per-row size estimate (0 = not measured yet) and the file size in bytes;
# together they are used later to approximate reading progress.
rowSize = 0
fileSize = os.path.getsize(filepath)

# Tab-separated input with every column read as a string; iterator=True
# returns a reader object so rows can be pulled in portions.
reader = pd.read_csv(
    filepath,
    sep='\t',
    header=0,
    encoding='utf8',
    dtype='str',
    iterator=True,
)


# Merge the CSV against cashbackUsers chunk by chunk, redrawing the progress
# bar after every processed chunk.
chunksNum = 0        # chunks processed so far
stop = 0             # chunk limit; 0 never equals chunksNum after the increment, i.e. "no limit"
merged_parts = []    # per-chunk merge results, concatenated once after the loop
arpu = pd.DataFrame()

go = True
while go:
    try:
        chunk = reader.get_chunk(chunkSize)
        # NOTE(review): a left join keeps every cashbackUsers row for every
        # chunk, so users without a match in a chunk appear with NaNs once per
        # chunk - confirm this is intended.
        tmp = cashbackUsers.merge(chunk, how='left', left_on='ctn', right_on='user_id')
        # Collect the parts and concatenate once at the end: calling pd.concat
        # inside the loop re-copies all accumulated rows on every iteration.
        merged_parts.append(tmp)

        if rowSize == 0:
            # Rough per-row size derived from the rendered first row of the
            # first chunk; it only feeds the progress estimate below.
            rowSize = len(re.sub(r'\s+', ';', chunk.iloc[0].to_string()))
        chunksNum += 1
        # The estimate assumes full chunks and a uniform row size, so it can
        # overshoot; cap it at 100%.
        update_progress(min(rowSize * chunksNum * chunkSize / fileSize, 1.0))
        if chunksNum == stop:
            go = False

    except StopIteration:
        # Normal termination: the reader has no more rows, so show a full bar.
        go = False
        update_progress(1)

    except Exception as e:
        # Best-effort: stop and report. The bar is deliberately not redrawn
        # here, because redrawing clears the cell output and would erase the
        # error message while showing a misleading 100%.
        go = False
        print(e)

if merged_parts:
    arpu = pd.concat(merged_parts)

arpu.head()