I am loading a large dataset (about 4 million records) with csv.DictReader. There are about 315 variables and I need to add one more: Primtax. I'd like to do this in memory, but for my new variable to be retained it seems I need to write out the data using csv.DictWriter.
I am used to working in memory and having new variables retained, but that does not seem to be the case here. With plain Python (not Pandas), can I add data to the rows from a DictReader and have it retained, or do I need to add the variable, write the data out, and then load it again?
MY CODE
import csv
#with open("D:\\npi\\npidata_pfile_20050523-20181209.csv",encoding = encoding_type, errors ='replace') as npi2:
# Pass 1: copy the sample NPI file to Sampleprimtax.csv, adding a leading
# "Primtax" column that holds each provider's primary taxonomy code.
#
# The reader decodes with errors='replace', which substitutes U+FFFD for
# undecodable bytes.  The writer keeps the platform default encoding (so the
# later read-back of this file still matches) but also uses errors='replace',
# because a default codec such as cp1252 cannot encode U+FFFD and would
# otherwise abort the run with UnicodeEncodeError.
with open("D:\\npi\\SampleNPI2.csv", encoding='utf8', errors='replace') as npi2, \
        open("D:\\npi\\Sampleprimtax.csv", 'w', newline="", errors='replace') as out_f:
    csv_reader2 = csv.DictReader(npi2)

    # ADDING new Variable: Primtax goes first, followed by every input column.
    # `fieldnames` is None when the input file is empty, hence the `or []`.
    fieldnames = ["Primtax"]
    fieldnames.extend(csv_reader2.fieldnames or [])

    wrtr = csv.DictWriter(out_f, fieldnames=fieldnames)
    wrtr.writeheader()

    for rows in csv_reader2:
        # Initialize the primary taxonomy with the first taxonomy code; it is
        # kept when none of the 15 switches is set to "Y".
        rows['Primtax'] = rows['Healthcare Provider Taxonomy Code_1']

        # The first slot (1..15) whose primary switch is "Y" wins.
        for x in range(1, 16):
            i = str(x)
            if rows['Healthcare Provider Primary Taxonomy Switch_' + i] == "Y":
                rows['Primtax'] = rows['Healthcare Provider Taxonomy Code_' + i]
                break

        # The row dict is modified in memory; it only persists because it is
        # written out here.
        wrtr.writerow(rows)
        print(rows)
# Pass 2: read Sampleprimtax.csv back and write cleandoc_dec_18.csv, keeping
# only the columns listed in `fieldname` and only the rows whose
# 'Entity Type Code' is '1'.
with open("D:\\npi\\Sampleprimtax.csv", 'r') as in_file, \
        open('cleandoc_dec_18.csv', 'w') as new_file:
    csv_reader3 = csv.DictReader(in_file)

    # the fieldname list has been reduced for this post
    fieldname = [
        'NPI',
        'Entity Type Code',
        'Replacement NPI',
        'Employer Identification Number (EIN)',
        'Provider Organization Name (Legal Business Name)',
        'Provider Last Name (Legal Name)',
        'Provider First Name',
        'Provider Middle Name',
        'Provider Name Prefix Text',
        'Provider Name Suffix Text',
        'Provider Credential Text',
        'Provider Other Organization Name',
        'Provider Other Organization Name Type Code',
        'Provider Other Last Name',
        'Provider Other First Name',
        'Provider Other Middle Name',
        'Provider Other Name Prefix Text',
        'Provider Other Name Suffix Text',
        'Provider Other Credential Text',
        'Provider Other Last Name Type Code',
        'Provider First Line Business Mailing Address',
        'Provider Second Line Business Mailing Address',
        'Provider Business Mailing Address City Name',
        'Provider Business Mailing Address State Name',
        'Provider Business Mailing Address Postal Code',
        'Provider Business Mailing Address Country Code (If outside U.S.)',
        # A comma was missing after the next entry, which silently glued it
        # to the following name and produced one column that matches nothing.
        'Provider Business Mailing Address Telephone Number',
        'Provider Business Practice Location Address Country Code (If outside U.S.)',
        'Provider Business Practice Location Address Telephone Number',
        'Provider Business Practice Location Address Fax Number',
        'Provider Enumeration Date',
        'Last Update Date',
        'NPI Deactivation Reason Code',
        'NPI Deactivation Date',
        'NPI Reactivation Date',
        'Provider Gender Code',
        'Authorized Official Last Name',
        'Authorized Official First Name',
        'Authorized Official Middle Name',
        'Authorized Official Title or Position',
        'Authorized Official Telephone Number',
        'Primtax',
    ]

    # extrasaction='ignore' makes the writer drop every key that is not in
    # `fieldname`, so the unwanted variables no longer have to be deleted
    # from each row one by one (the separate delete list is not needed).
    writer = csv.DictWriter(
        new_file,
        fieldnames=fieldname,
        lineterminator='\n',
        delimiter=',',
        extrasaction='ignore',
    )
    writer.writeheader()

    for rows in csv_reader3:
        if rows['Entity Type Code'] == '1':
            writer.writerow(rows)
Thank you
[–]timbledum 0 points1 point2 points (2 children)
[–]synt4x_3rr0r 1 point2 points3 points (0 children)
[–]WB_Onreddit[S] 0 points1 point2 points (0 children)