1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
# Build a per-taxon read-count summary and write it as a tab-separated table.
#
# Names expected from outside this section: pd (pandas), dir_taxonomy,
# file_test and file_output.
#   * dir_taxonomy + "names.dmp" is read as a '|'-separated table indexed by
#     its first field (presumably the NCBI taxonomy names dump -- confirm).
#   * file_test is a tab-separated table with a header row; the columns used
#     here are seqID, taxID, hitLength, queryLength and numMatches
#     (NOTE(review): looks like a read-classifier report -- confirm the tool).
#
# Output columns: Description, Count, Percent, Percent_informative.
pd.set_option('expand_frame_repr', False)
pd.options.mode.chained_assignment = None

# Index labels of the two synthetic rows appended to the taxonomy table.
UNCLASSIFIED_ID = 0
TRASH_ID = -1
# A classified hit is kept only if it is at least this long and covers at
# least this fraction of the query; otherwise it is counted under "Trash".
MIN_HIT_LENGTH = 30
MIN_HIT_FRACTION = 0.7

df = pd.read_csv(dir_taxonomy+"names.dmp", sep="|", names=["Description", "Strain", "Type", "Other"], index_col=0)
# Strip only the padding around each field. The previous pattern removed
# every space, which also turned "scientific name" into "scientificname",
# so the filter below could never match (and multi-word names were fused).
df = df.replace({r'^\s+|\s+$': ''}, regex=True)
# Keep one row per taxon (its scientific name) and only the name column.
df = df.loc[df["Type"] == "scientific name", ["Description"]].copy()

df_test = pd.read_csv(file_test, header=0, sep='\t', index_col=0)

df.loc[UNCLASSIFIED_ID] = ['Unclassified']
df.loc[TRASH_ID] = ['Trash']
df['Count'] = 0.0

# Each row contributes 1/numMatches to exactly one taxon row, so a read
# reported with several matches is shared out between them.
hit_columns = ['seqID', 'taxID', 'hitLength', 'queryLength', 'numMatches']
for seq_id, tax_id, hit_len, query_len, num_matches in df_test[hit_columns].itertuples(index=False, name=None):
    weight = 1 / num_matches
    is_classified = seq_id != 'unclassified'
    if is_classified and not (hit_len >= MIN_HIT_LENGTH and hit_len / query_len >= MIN_HIT_FRACTION):
        # Classified, but the hit is too short or covers too little.
        target = TRASH_ID
    else:
        # Good classified hit, or an unclassified row (counted under its own
        # taxID, as before).
        target = tax_id
    # A taxID absent from the taxonomy table raises KeyError here, as before.
    df.at[target, 'Count'] += weight

# Drop taxa that received no reads; copy so later column assignments write
# to an independent frame.
df = df[df['Count'] != 0.0].copy()

# Identify the synthetic rows by label. They may have been dropped above, so
# relying on "the last two rows" would exclude real taxa from the total.
is_special = df.index.isin([UNCLASSIFIED_ID, TRASH_ID])
total_count = df['Count'].sum()
informative_count = df.loc[~is_special, 'Count'].sum()

df['Percent'] = (df['Count'] * 100 / total_count).round(5)
df['Percent_informative'] = (df['Count'] * 100 / informative_count).round(5)
# Mask assignment never creates rows, unlike df.at[...] on a missing label.
df.loc[is_special, 'Percent_informative'] = 0

df['Count'] = df['Count'].round(2)
df = df.sort_values('Count', ascending=False)
df.to_csv(file_output, header=True, index=True, sep='\t')
Partager