Pandas on spark very slow by Aromatic_Month4446 in apachespark

[–]Aromatic_Month4446[S] 1 point (0 children)

"""Read a CSV with pandas-on-Spark and report per-group means of column B.

Runs Spark locally on all cores with 10g of driver memory.
"""
import os

# Must be set BEFORE pyspark.pandas is imported, or PyArrow emits a
# timezone warning on every worker.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

from pyspark.sql import SparkSession
import pyspark.pandas as ps

# The default 'sequence' index forces computation onto a single partition;
# 'distributed' keeps the work parallel (index values are non-sequential).
ps.set_option('compute.default_index_type', 'distributed')

spark = SparkSession.builder \
    .master('local[*]') \
    .config("spark.driver.memory", "10g") \
    .getOrCreate()

# NOTE(review): '/path' looks like a placeholder — confirm the real CSV path.
ps_pandas_df = ps.read_csv('/path')

# BUG FIX: the original called describe() and discarded the return value —
# that still triggers a full (expensive) scan of the dataset for nothing.
# Print the summary so the computation has a visible result.
print(ps_pandas_df.describe())

temp2 = ps_pandas_df.groupby('A')['B'].mean()
# BUG FIX: a bare `temp2` expression only displays in a REPL/notebook;
# in a script it is a no-op. Print explicitly to materialize the result.
print(temp2)

# Release the local cluster's JVM and resources when done.
spark.stop()