@momocraft #10 Yes, it's TSV. The merge itself goes through fine; things only break once I try to write the result out. (This is my first time using PySpark, so it may well be my own mistake.)
import os
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# First file: its basename becomes the name of the value column.
BName = str(os.path.basename(bg_f[0]).split('.')[0])
schema = StructType([
    StructField('CataID', StringType(), True),
    StructField('Start_Block', IntegerType(), True),
    StructField('End_Block', IntegerType(), True),
    StructField(BName, IntegerType(), True)
])
temp = sqlContext.read.csv(bg_f[0], sep='\t', header=False, schema=schema)

# Outer-join the remaining files onto temp, one at a time, on the three key columns.
for p in bg_f[1:]:
    SName = str(os.path.basename(p).split('.')[0])
    schema = StructType([
        StructField('CataID', StringType(), True),
        StructField('Start_Block', IntegerType(), True),
        StructField('End_Block', IntegerType(), True),
        # This was BName in the original: every frame then carried the
        # same value-column name, so the join still ran but the frame
        # ended up with duplicate columns, which Spark rejects on write.
        StructField(SName, IntegerType(), True)
    ])
    cur = sqlContext.read.csv(p, sep='\t', header=False, schema=schema)
    temp = temp.join(cur,
                     on=['CataID', 'Start_Block', 'End_Block'],
                     how='outer')

temp = temp.drop('CataID', 'Start_Block', 'End_Block')
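
For reference, a minimal sketch of the write-out step, which the original post doesn't show; the output path, coalesce(1), and mode='overwrite' are my assumptions. With the BName/SName mix-up above, this is the call that would fail, since Spark checks for duplicate column names when writing:

# Hypothetical write step: 'merged_out' is an assumed output directory.
# coalesce(1) just forces a single output file for convenience.
temp.coalesce(1).write.csv('merged_out', sep='\t', header=True, mode='overwrite')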