In [1]:
import pandas as pd
In [5]:
# Read the tab-separated file "mc3_BRCA_mc3.txt" into a DataFrame.
TCGA_BRCA_MC3_Public = pd.read_csv(
    "mc3_BRCA_mc3.txt",
    sep="\t",
)
In [6]:
# Display the BRCA table; the notebook renders a truncated preview
# (first and last rows) followed by the overall shape.
TCGA_BRCA_MC3_Public
Out[6]:
sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-3C-AAAU-01 | 10 | 122668955 | 122668955 | G | A | WDR11 | 3'UTR | NaN | 0.39 | NaN | NaN |
1 | TCGA-3C-AAAU-01 | 10 | 8115874 | 8115875 | - | A | GATA3 | Frame_Shift_Ins | p.P409Afs*99 | 0.34 | NaN | NaN |
2 | TCGA-3C-AAAU-01 | 11 | 65272906 | 65272908 | AAA | - | MALAT1 | RNA | NaN | 0.27 | NaN | NaN |
3 | TCGA-3C-AAAU-01 | 11 | 66082467 | 66082467 | C | T | CD248 | Missense_Mutation | p.E678K | 0.07 | tolerated(0.12) | benign(0.001) |
4 | TCGA-3C-AAAU-01 | 11 | 66193652 | 66193652 | G | C | NPAS4 | 3'UTR | NaN | 0.20 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
92114 | TCGA-Z7-A8R6-01 | 9 | 95396703 | 95396703 | C | T | IPPK | Missense_Mutation | p.E379K | 0.16 | deleterious(0.01) | probably_damaging(0.968) |
92115 | TCGA-Z7-A8R6-01 | X | 123217344 | 123217344 | C | T | STAG2 | Missense_Mutation | p.L1000F | 0.39 | deleterious(0) | probably_damaging(1) |
92116 | TCGA-Z7-A8R6-01 | X | 30671631 | 30671631 | G | A | GK | 5'UTR | NaN | 0.36 | NaN | NaN |
92117 | TCGA-Z7-A8R6-01 | X | 51151398 | 51151398 | C | G | CXorf67 | 3'UTR | NaN | 0.32 | NaN | NaN |
92118 | TCGA-Z7-A8R6-01 | X | 54014379 | 54014379 | T | A | PHF8 | Splice_Site | p.X613_splice | 0.07 | NaN | NaN |
92119 rows × 12 columns
In [11]:
# Count how many BRCA rows carry each value of the `effect` column.
mutation_type_counts = (
    TCGA_BRCA_MC3_Public["effect"]
    .value_counts()
)
In [12]:
# Show the number of BRCA rows per `effect` value.
mutation_type_counts
Out[12]:
effect Missense_Mutation 45634 Silent 17122 Frame_Shift_Del 8522 3'UTR 6695 Nonsense_Mutation 3666 Intron 3212 5'UTR 2492 Splice_Site 1399 RNA 1160 Frame_Shift_Ins 610 3'Flank 530 5'Flank 443 In_Frame_Del 441 Translation_Start_Site 74 Nonstop_Mutation 66 In_Frame_Ins 34 large deletion 19 Name: count, dtype: int64
In [13]:
# Count how many BRCA rows belong to each `sample` identifier.
# Despite the name, this holds per-sample row counts, not sample categories.
sample_types = (
    TCGA_BRCA_MC3_Public["sample"]
    .value_counts()
)
In [14]:
# Show the number of BRCA rows per sample identifier.
sample_types
Out[14]:
sample TCGA-AC-A23H-01 6405 TCGA-EW-A2FV-01 4231 TCGA-D8-A27V-01 3332 TCGA-5L-AAT1-01 1995 TCGA-BH-A18G-01 1899 ... TCGA-AO-A03U-01 7 TCGA-A2-A25F-01 6 TCGA-LL-A440-01 6 TCGA-EW-A1P1-01 3 TCGA-AC-A2FK-01 3 Name: count, Length: 791, dtype: int64
In [16]:
# Count how many BRCA rows are annotated with each `gene` symbol.
# Despite the name, this holds per-gene row counts, not gene categories.
gene_types = (
    TCGA_BRCA_MC3_Public["gene"]
    .value_counts()
)
In [26]:
# Show the number of BRCA rows per gene symbol.
gene_types
Out[26]:
gene PIK3CA 315 TTN 285 TP53 273 MUC16 141 CDH1 108 ... ZNF587B 1 BLZF1 1 SLPI 1 PLTP 1 FOXQ1 1 Name: count, Length: 18065, dtype: int64
In [32]:
# Plot the 25 genes with the most mutation records in the BRCA table.
# (The previous header said "top 50", but the code and title both use 25.)
import matplotlib.pyplot as plt

# value_counts() returns counts in descending order, so head(25) is the top 25.
top_genes = gene_types.head(25)

# Use an explicit Figure/Axes pair rather than the implicit pyplot state,
# so every call below is unambiguous about which plot it modifies.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind='barh', color='blue', edgecolor='black', ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title("Top 25 Most Frequently Mutated Genes in TCGA BRCA (MC3)")
# Horizontal bars are drawn bottom-up; invert so the highest count is on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()
In [38]:
# Read the tab-separated file "mc3_COAD_mc3.txt" into a DataFrame.
TCGA_COAD_MC3_Public = pd.read_csv(
    "mc3_COAD_mc3.txt",
    sep="\t",
)
In [39]:
# Display the COAD table; the notebook renders a truncated preview
# (first and last rows) followed by the overall shape.
TCGA_COAD_MC3_Public
Out[39]:
sample | chr | start | end | reference | alt | gene | effect | Amino_Acid_Change | DNA_VAF | SIFT | PolyPhen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | TCGA-3L-AA1B-01 | 10 | 124399948 | 124399948 | T | C | DMBT1 | Silent | p.Y2316Y | 0.17 | NaN | NaN |
1 | TCGA-3L-AA1B-01 | 10 | 128193330 | 128193330 | C | T | C10orf90 | Missense_Mutation | p.V147M | 0.28 | deleterious(0) | probably_damaging(0.999) |
2 | TCGA-3L-AA1B-01 | 10 | 49997997 | 49997997 | C | T | WDFY4 | Missense_Mutation | p.R1345W | 0.16 | deleterious(0) | probably_damaging(1) |
3 | TCGA-3L-AA1B-01 | 10 | 6533674 | 6533674 | C | A | PRKCQ | Missense_Mutation | p.G254V | 0.27 | deleterious(0) | probably_damaging(0.986) |
4 | TCGA-3L-AA1B-01 | 11 | 103153768 | 103153768 | C | T | DYNC2H1 | Missense_Mutation | p.P3622L | 0.31 | NaN | probably_damaging(0.977) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
215758 | TCGA-WS-AB45-01 | X | 91133412 | 91133412 | G | A | PCDH11X | Missense_Mutation | p.A725T | 0.16 | tolerated(0.33) | benign(0.004) |
215759 | TCGA-WS-AB45-01 | X | 9661224 | 9661224 | C | T | TBL1X | Silent | p.D309D | 0.21 | NaN | NaN |
215760 | TCGA-WS-AB45-01 | X | 9863424 | 9863424 | G | A | SHROOM2 | Silent | p.A492A | 0.15 | NaN | NaN |
215761 | TCGA-WS-AB45-01 | X | 99662857 | 99662857 | C | T | PCDH19 | Missense_Mutation | p.V247M | 0.15 | deleterious(0) | possibly_damaging(0.891) |
215762 | TCGA-WS-AB45-01 | X | 99905780 | 99905780 | A | G | SRPX2 | Splice_Site | p.X28_splice | 0.17 | NaN | NaN |
215763 rows × 12 columns
In [40]:
# Count how many COAD rows carry each value of the `effect` column.
coad_mutation_type_counts = (
    TCGA_COAD_MC3_Public["effect"]
    .value_counts()
)
In [41]:
# Show the number of COAD rows per `effect` value.
coad_mutation_type_counts
Out[41]:
effect Missense_Mutation 112275 Silent 44252 3'UTR 16054 Frame_Shift_Del 11290 Nonsense_Mutation 8575 Intron 6842 5'UTR 5293 Frame_Shift_Ins 3038 Splice_Site 2537 RNA 2416 In_Frame_Del 1019 5'Flank 937 3'Flank 909 Translation_Start_Site 139 Nonstop_Mutation 91 In_Frame_Ins 71 large deletion 25 Name: count, dtype: int64
In [43]:
# Count how many COAD rows are annotated with each `gene` symbol.
# Despite the name, this holds per-gene row counts, not gene categories.
coad_gene_types = (
    TCGA_COAD_MC3_Public["gene"]
    .value_counts()
)
In [44]:
# Show the number of COAD rows per gene symbol.
coad_gene_types
Out[44]:
gene TTN 776 APC 365 MUC16 286 SYNE1 259 OBSCN 215 ... AREGB 1 FGFBP1 1 RP11-813I20.2 1 C2orf48 1 MIR507 1 Name: count, Length: 19329, dtype: int64
In [46]:
# Horizontal bar chart of the 25 genes with the most rows in the COAD table.
# Relies on `plt` (matplotlib.pyplot) having been imported in an earlier cell.
coad_top_genes = coad_gene_types.head(25)

fig, ax = plt.subplots(figsize=(10, 6))
coad_top_genes.plot(kind='barh', ax=ax, color='red', edgecolor='black')
ax.set(
    xlabel="Number of Mutations",
    ylabel="Gene",
    title="Top 25 Most Frequently Mutated Genes in TCGA COAD (MC3)",
)
# Flip the y-axis so the first (largest) entry is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()