Websites
- The Blaze Ecosystem (Blaze)
- Dask: Flexible library for parallel computing in Python.
- DataShape: Data layout language for array programming.
- Odo: Shapeshifting for your data. It efficiently migrates data from the source to the target through a network of conversions (a minimal example follows this list).
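A minimal sketch of Odo's conversion API, assuming a local input.csv: odo(source, target) picks a conversion path through its network automatically.

import pandas as pd
from odo import odo

# Convert a CSV file on disk into an in-memory pandas DataFrame.
df = odo('input.csv', pd.DataFrame)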
Reading Text Files
Read a delimited text file into a DataFrame with the Spark 2.x SparkSession API:

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Demo") \
    .config("spark.demo.config.option", "demo-value") \
    .getOrCreate()

# header=True reads column names from the first line; sep="|" sets the delimiter.
df = spark.read.csv("input.csv", header=True, sep="|")
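A quick sanity check on the loaded frame, using standard DataFrame methods (df.columns and df.show are part of the PySpark DataFrame API):

df.columns   # column names taken from the header line
df.show(5)   # print the first five rows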
Alternatively, read the file with pandas and convert the result to a Spark DataFrame:

from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

sc = SparkContext('local', 'example')
sql_sc = SQLContext(sc)  # wraps the SparkContext for DataFrame operations

p_df = pd.read_csv('file.csv')  # assumes the file contains a header row
# p_df = pd.read_csv('file.csv', names=['column 1', 'column 2'])  # if no header
s_df = sql_sc.createDataFrame(p_df)  # convert the pandas DataFrame to Spark
sc.textFile("file.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)>1) \
.map(lambda line: (line[0],line[1])) \
sc.textFile("file.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)>1) \
.map(lambda line: (line[0],line[1])) \
.collect()
sc.textFile("file.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)>1) \
.map(lambda line: (line[0],line[1])) \
.collect()
sc.textFile("input.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)<=1) \
sc.textFile("input.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)<=1) \
.collect()
sc.textFile("input.csv") \
.map(lambda line: line.split(",")) \
.filter(lambda line: len(line)<=1) \
.collect()
"input.csv", header=True, mode="DROPMALFORMED", schema=schema
spark.read.csv(
"input.csv", header=True, mode="DROPMALFORMED", schema=schema
)
.option("header", "true")
.option("mode", "DROPMALFORMED")
(spark.read
.schema(schema)
.option("header", "true")
.option("mode", "DROPMALFORMED")
.csv("input.csv"))
Reading a CSV File with a Known Structure
Define the expected schema with StructType and pass it to the reader. On Spark 1.x the CSV format comes from the external spark-csv package (com.databricks.spark.csv):

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Column A is an integer, B a double, C a string.
schema = StructType([
    StructField("A", IntegerType()),
    StructField("B", DoubleType()),
    StructField("C", StringType())
])

# Rows that do not match the declared schema are dropped by DROPMALFORMED.
(sqlContext
    .read
    .format("com.databricks.spark.csv")
    .schema(schema)
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .load("input.csv"))