In [1]:
import pandas as pd
In [5]:
# Load the tab-separated file "mc3_BRCA_mc3.txt" into a DataFrame.
TCGA_BRCA_MC3_Public = pd.read_csv(
    "mc3_BRCA_mc3.txt",
    sep="\t",
)
In [6]:
# Show the BRCA table as the cell output (bare last expression); the
# recorded output below is the truncated head/tail view of 92119 rows.
TCGA_BRCA_MC3_Public
Out[6]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3C-AAAU-01 10 122668955 122668955 G A WDR11 3'UTR NaN 0.39 NaN NaN
1 TCGA-3C-AAAU-01 10 8115874 8115875 - A GATA3 Frame_Shift_Ins p.P409Afs*99 0.34 NaN NaN
2 TCGA-3C-AAAU-01 11 65272906 65272908 AAA - MALAT1 RNA NaN 0.27 NaN NaN
3 TCGA-3C-AAAU-01 11 66082467 66082467 C T CD248 Missense_Mutation p.E678K 0.07 tolerated(0.12) benign(0.001)
4 TCGA-3C-AAAU-01 11 66193652 66193652 G C NPAS4 3'UTR NaN 0.20 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
92114 TCGA-Z7-A8R6-01 9 95396703 95396703 C T IPPK Missense_Mutation p.E379K 0.16 deleterious(0.01) probably_damaging(0.968)
92115 TCGA-Z7-A8R6-01 X 123217344 123217344 C T STAG2 Missense_Mutation p.L1000F 0.39 deleterious(0) probably_damaging(1)
92116 TCGA-Z7-A8R6-01 X 30671631 30671631 G A GK 5'UTR NaN 0.36 NaN NaN
92117 TCGA-Z7-A8R6-01 X 51151398 51151398 C G CXorf67 3'UTR NaN 0.32 NaN NaN
92118 TCGA-Z7-A8R6-01 X 54014379 54014379 T A PHF8 Splice_Site p.X613_splice 0.07 NaN NaN

92119 rows × 12 columns

In [11]:
# Tally how many rows of the BRCA table carry each value of the "effect" column.
mutation_type_counts = TCGA_BRCA_MC3_Public["effect"].value_counts()
In [12]:
# Display the per-effect row counts for the BRCA table.
mutation_type_counts
Out[12]:
effect
Missense_Mutation         45634
Silent                    17122
Frame_Shift_Del            8522
3'UTR                      6695
Nonsense_Mutation          3666
Intron                     3212
5'UTR                      2492
Splice_Site                1399
RNA                        1160
Frame_Shift_Ins             610
3'Flank                     530
5'Flank                     443
In_Frame_Del                441
Translation_Start_Site       74
Nonstop_Mutation             66
In_Frame_Ins                 34
large deletion               19
Name: count, dtype: int64
In [13]:
# Number of rows recorded for each sample ID in the BRCA table.
# NOTE: despite the name, this holds per-sample counts, not sample "types".
sample_types = TCGA_BRCA_MC3_Public["sample"].value_counts()
In [14]:
# Display the number of rows per sample ID (sorted from most to fewest).
sample_types
Out[14]:
sample
TCGA-AC-A23H-01    6405
TCGA-EW-A2FV-01    4231
TCGA-D8-A27V-01    3332
TCGA-5L-AAT1-01    1995
TCGA-BH-A18G-01    1899
                   ... 
TCGA-AO-A03U-01       7
TCGA-A2-A25F-01       6
TCGA-LL-A440-01       6
TCGA-EW-A1P1-01       3
TCGA-AC-A2FK-01       3
Name: count, Length: 791, dtype: int64
In [16]:
# Number of rows recorded for each gene symbol in the BRCA table.
# NOTE: despite the name, this holds per-gene counts, not gene "types".
gene_types = TCGA_BRCA_MC3_Public["gene"].value_counts()
In [26]:
# Display the number of rows per gene symbol (sorted from most to fewest).
gene_types
Out[26]:
gene
PIK3CA     315
TTN        285
TP53       273
MUC16      141
CDH1       108
          ... 
ZNF587B      1
BLZF1        1
SLPI         1
PLTP         1
FOXQ1        1
Name: count, Length: 18065, dtype: int64
In [32]:
# Plot the 25 most frequently mutated genes in the BRCA cohort.
# (An earlier version of this comment said "top 50", but the code has always
# plotted 25.)
import matplotlib.pyplot as plt

# Single place to change how many genes are shown, so the slice and the
# figure title cannot drift apart.
n_top_genes = 25

# gene_types comes from value_counts(), which sorts counts in descending
# order, so head() returns the most frequently occurring genes.
top_genes = gene_types.head(n_top_genes)

# Use explicit Figure/Axes handles instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 6))
top_genes.plot(kind='barh', color='blue', edgecolor='black', ax=ax)
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
# Report the number of genes actually plotted (equals n_top_genes unless the
# table contains fewer distinct genes than requested).
ax.set_title(f"Top {len(top_genes)} Most Frequently Mutated Genes in TCGA BRCA (MC3)")
# Flip the y-axis so the gene with the highest count is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()
No description has been provided for this image
In [38]:
# Load the tab-separated file "mc3_COAD_mc3.txt" into a DataFrame.
TCGA_COAD_MC3_Public = pd.read_csv(
    "mc3_COAD_mc3.txt",
    sep="\t",
)
In [39]:
# Show the COAD table as the cell output (bare last expression); the
# recorded output below is the truncated head/tail view of 215763 rows.
TCGA_COAD_MC3_Public
Out[39]:
sample chr start end reference alt gene effect Amino_Acid_Change DNA_VAF SIFT PolyPhen
0 TCGA-3L-AA1B-01 10 124399948 124399948 T C DMBT1 Silent p.Y2316Y 0.17 NaN NaN
1 TCGA-3L-AA1B-01 10 128193330 128193330 C T C10orf90 Missense_Mutation p.V147M 0.28 deleterious(0) probably_damaging(0.999)
2 TCGA-3L-AA1B-01 10 49997997 49997997 C T WDFY4 Missense_Mutation p.R1345W 0.16 deleterious(0) probably_damaging(1)
3 TCGA-3L-AA1B-01 10 6533674 6533674 C A PRKCQ Missense_Mutation p.G254V 0.27 deleterious(0) probably_damaging(0.986)
4 TCGA-3L-AA1B-01 11 103153768 103153768 C T DYNC2H1 Missense_Mutation p.P3622L 0.31 NaN probably_damaging(0.977)
... ... ... ... ... ... ... ... ... ... ... ... ...
215758 TCGA-WS-AB45-01 X 91133412 91133412 G A PCDH11X Missense_Mutation p.A725T 0.16 tolerated(0.33) benign(0.004)
215759 TCGA-WS-AB45-01 X 9661224 9661224 C T TBL1X Silent p.D309D 0.21 NaN NaN
215760 TCGA-WS-AB45-01 X 9863424 9863424 G A SHROOM2 Silent p.A492A 0.15 NaN NaN
215761 TCGA-WS-AB45-01 X 99662857 99662857 C T PCDH19 Missense_Mutation p.V247M 0.15 deleterious(0) possibly_damaging(0.891)
215762 TCGA-WS-AB45-01 X 99905780 99905780 A G SRPX2 Splice_Site p.X28_splice 0.17 NaN NaN

215763 rows × 12 columns

In [40]:
# Tally how many rows of the COAD table carry each value of the "effect" column.
coad_mutation_type_counts = TCGA_COAD_MC3_Public["effect"].value_counts()
In [41]:
# Display the per-effect row counts for the COAD table.
coad_mutation_type_counts
Out[41]:
effect
Missense_Mutation         112275
Silent                     44252
3'UTR                      16054
Frame_Shift_Del            11290
Nonsense_Mutation           8575
Intron                      6842
5'UTR                       5293
Frame_Shift_Ins             3038
Splice_Site                 2537
RNA                         2416
In_Frame_Del                1019
5'Flank                      937
3'Flank                      909
Translation_Start_Site       139
Nonstop_Mutation              91
In_Frame_Ins                  71
large deletion                25
Name: count, dtype: int64
In [43]:
# Number of rows recorded for each gene symbol in the COAD table.
# NOTE: despite the name, this holds per-gene counts, not gene "types".
coad_gene_types = TCGA_COAD_MC3_Public["gene"].value_counts()
In [44]:
# Display the number of rows per gene symbol (sorted from most to fewest).
coad_gene_types
Out[44]:
gene
TTN              776
APC              365
MUC16            286
SYNE1            259
OBSCN            215
                ... 
AREGB              1
FGFBP1             1
RP11-813I20.2      1
C2orf48            1
MIR507             1
Name: count, Length: 19329, dtype: int64
In [46]:
# Horizontal bar chart of the 25 genes with the highest row counts in the
# COAD table. coad_gene_types is already sorted in descending order by
# value_counts(), so head(25) selects the most frequent genes.
coad_top_genes = coad_gene_types.head(25)

# Draw on an explicit Axes rather than relying on the current pyplot figure.
fig, ax = plt.subplots(figsize=(10, 6))
coad_top_genes.plot(ax=ax, kind='barh', color='red', edgecolor='black')
ax.set_xlabel("Number of Mutations")
ax.set_ylabel("Gene")
ax.set_title("Top 25 Most Frequently Mutated Genes in TCGA COAD (MC3)")
# Flip the y-axis so the gene with the highest count is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()
No description has been provided for this image