import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the GTEx integrin expression table from the Excel workbook.
data = pd.read_excel("gtex_integrin_7_organs.xlsx")

data

# Row subsets chosen by organ label in the `primary_site` column.
brain_data = data.loc[data['primary_site'].eq('Brain')]
lung_data = data.loc[data['primary_site'].eq('Lung')]
brain_lung_data = data.loc[data['primary_site'].isin(['Brain', 'Lung'])]
# Positionally drop the first column (the sample-ID column in the displayed table)
# and keep the rest. NOTE: `primary_site` is retained, since the models read it as
# the target, so this frame is labels + expression despite its name.
data_brain_lung_expression_only = brain_lung_data.iloc[:, 1:]
data_brain_lung_expression_only

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Features: a single-column DataFrame of ITGA10 expression.
# Target: the organ label taken from `primary_site`.
X = data_brain_lung_expression_only.loc[:, ['ITGA10']]
y = data_brain_lung_expression_only.loc[:, 'primary_site']

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic regression classifier (default settings) on the training portion.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the held-out samples and report the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using ITGA10: {accuracy:.2f}")

Accuracy using ITGA10: 0.94

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: divide by zero encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: overflow encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: invalid value encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights

#switch from ITGA10 to ITGB4
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define what is X and what is y in the model.
# X: single-column DataFrame of ITGB4 expression; y: organ label from `primary_site`.
X=data_brain_lung_expression_only[['ITGB4']]
y=data_brain_lung_expression_only['primary_site']

# Define the split between train and test (30% held out, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the model to use: logistic regression with default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate on the held-out samples.
y_pred = model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
# The label must name the gene actually used as the feature (ITGB4);
# it previously said ITGA10, which mislabeled this result.
print(f"Accuracy using ITGB4: {accuracy:.2f}")

Accuracy using ITGA10: 0.81

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: divide by zero encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: overflow encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: invalid value encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: Prepare data
# Feature: ITGB4 expression only (swap in another integrin column to compare).
X = data_brain_lung_expression_only.loc[:, ['ITGB4']]
# Binary target: Brain -> 0, Lung -> 1, so the positive class (1) is Lung.
y = data_brain_lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})

# Step 2: Train/test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities; column 1 is the probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: Compute ROC curve and AUC from the held-out labels and scores
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot the ROC curve on a single square axes
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal = random guessing
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Brain vs Lung) Using ITGB4 Expression')
ax.legend()
ax.grid(True)
plt.show()

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: divide by zero encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: overflow encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: invalid value encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: Prepare data
# Feature: ITGA10 expression only (swap in another integrin column to compare).
X = data_brain_lung_expression_only.loc[:, ['ITGA10']]
# Binary target: Brain -> 0, Lung -> 1, so the positive class (1) is Lung.
y = data_brain_lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})

# Step 2: Train/test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities; column 1 is the probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: Compute ROC curve and AUC from the held-out labels and scores
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot the ROC curve on a single square axes
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal = random guessing
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Brain vs Lung) Using ITGA10 Expression')
ax.legend()
ax.grid(True)
plt.show()

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: divide by zero encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: overflow encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:330: RuntimeWarning: invalid value encountered in matmul
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights

#AUROC curve 
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
# Step 1: Prepare data
# Features: two columns this time, ITGA3 and ITGB4 expression.
X = data_brain_lung_expression_only.loc[:, ['ITGA3', 'ITGB4']]
# Binary target: Brain -> 0, Lung -> 1, so the positive class (1) is Lung.
y = data_brain_lung_expression_only['primary_site'].map({'Brain': 0, 'Lung': 1})

# Step 2: Train/test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 3: Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 4: Predict probabilities; column 1 is the probability of label 1 (Lung)
y_proba = model.predict_proba(X_test)[:, 1]

# Step 5: Compute ROC curve and AUC from the held-out labels and scores
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Step 6: Plot the ROC curve on a single square axes
fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')  # diagonal = random guessing
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve (Brain vs Lung) Using ITGA3 and ITGB4 Expression')
ax.legend()
ax.grid(True)
plt.show()

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:200: RuntimeWarning: divide by zero encountered in matmul
  raw_prediction = X @ weights + intercept
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:200: RuntimeWarning: overflow encountered in matmul
  raw_prediction = X @ weights + intercept
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:200: RuntimeWarning: invalid value encountered in matmul
  raw_prediction = X @ weights + intercept

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Prepare features and target
# Features: expression of the two selected integrin genes only.
# Target: organ label for every sample in `data` (all organs, not just Brain/Lung).
selected_genes = ['ITGA10', 'ITGB4']
X = data[selected_genes]
y = data['primary_site']

# Step 2: Encode organ labels as numbers (le.classes_ keeps the original names
# so the report below can be labeled with them)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Step 3: Train/test split (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Train multinomial logistic regression
# `multi_class` is deliberately not passed: it is deprecated since scikit-learn 1.5
# (it raised a FutureWarning here), and with the lbfgs solver a multiclass target
# is already fit as a multinomial model by default.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Step 5: Predict and evaluate on the held-out samples
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Rows are true classes, columns are predicted classes, in encoded-label order.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7939698492462312

Classification Report:
              precision    recall  f1-score   support

 Bone Marrow       0.77      1.00      0.87        10
       Brain       0.81      0.94      0.87       247
      Breast       0.64      0.41      0.50        44
       Liver       1.00      0.65      0.79        23
        Lung       0.76      0.88      0.82        43
       Ovary       0.50      0.10      0.17        10
    Prostate       0.75      0.14      0.24        21

    accuracy                           0.79       398
   macro avg       0.75      0.59      0.61       398
weighted avg       0.78      0.79      0.77       398


Confusion Matrix:
[[ 10   0   0   0   0   0   0]
 [  3 231   3   0   8   1   1]
 [  0  25  18   0   1   0   0]
 [  0   8   0  15   0   0   0]
 [  0   4   1   0  38   0   0]
 [  0   6   0   0   3   1   0]
 [  0  12   6   0   0   0   3]]

/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:1247: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
  warnings.warn(
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:203: RuntimeWarning: divide by zero encountered in matmul
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:203: RuntimeWarning: overflow encountered in matmul
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:203: RuntimeWarning: invalid value encountered in matmul
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:336: RuntimeWarning: divide by zero encountered in matmul
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:336: RuntimeWarning: overflow encountered in matmul
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.py:336: RuntimeWarning: invalid value encountered in matmul
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights

# Count how many samples carry each `primary_site` label in the full table.
data['primary_site'].value_counts()

primary_site
Brain          1152
Lung            288
Breast          179
Liver           110
Prostate        100
Ovary            88
Bone Marrow      70
Name: count, dtype: int64

# Dimensions of the full table as (rows, columns).
data.shape

(1987, 29)

# Accuracy of the most recent predictions, printed to two decimals.
# The string needs the `f` prefix; without it the literal text "{accuracy:.2f}"
# was printed instead of the value.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: {accuracy:.2f}

	Unnamed: 0	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
0	GTEX-13QIC-0011-R1a-SM-5O9CJ	Brain	0.5763	-6.5064	2.2573	0.7832	1.0363	4.6035	2.5731	-2.8262	...	2.8562	1.3846	5.8430	1.1316	-0.7108	3.5387	-0.0725	-0.4521	0.2029	-2.8262
1	GTEX-1399S-1726-SM-5L3DI	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
2	GTEX-PWCY-1326-SM-48TCU	Ovary	2.3953	-5.0116	1.4547	4.2593	-0.7346	4.4149	0.2642	1.5216	...	3.6816	1.5465	7.2964	-0.9406	2.7742	5.0414	2.0325	0.7579	2.2573	1.2516
3	GTEX-QXCU-0626-SM-2TC69	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
4	GTEX-ZA64-1526-SM-5CVMD	Breast	2.0569	-2.4659	3.3993	3.1311	3.0074	4.4977	-1.7809	2.7139	...	4.7340	0.6332	7.3496	-0.9406	2.5338	6.5696	1.7229	-0.6416	3.1195	1.1050
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1982	GTEX-QMRM-0826-SM-3NB33	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1983	GTEX-YFCO-1626-SM-4W1Z3	Prostate	2.9581	-4.6082	1.1641	4.6938	1.5902	5.8625	-0.5125	1.7617	...	3.8798	-1.4699	7.5163	-0.3752	2.9562	5.3035	4.4304	-0.9406	3.6136	0.4233
1984	GTEX-1117F-2826-SM-5GZXL	Breast	4.3184	-6.5064	1.0433	4.8440	3.5498	4.6809	1.0293	3.3478	...	5.3256	-0.0725	7.7516	1.1382	2.1411	7.1132	0.3796	0.0854	3.8650	1.0151
1985	GTEX-Q2AG-2826-SM-2HMJQ	Brain	3.4622	-5.5735	1.5013	5.4835	1.7702	4.7517	0.6790	-3.1714	...	1.1960	4.1740	4.3002	0.5470	-0.9971	3.7982	-0.2498	1.4808	-0.5125	-0.5125
1986	GTEX-XV7Q-0426-SM-4BRVN	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007

	primary_site	ITGA10	ITGAD	ITGAM	ITGA3	ITGBL1	ITGAE	ITGA2	ITGB3	ITGA7	...	ITGA6	ITGA2B	ITGB1	ITGAL	ITGA9	ITGB5	ITGA8	ITGA4	ITGA1	ITGA11
0	Brain	0.5763	-6.5064	2.2573	0.7832	1.0363	4.6035	2.5731	-2.8262	4.9663	...	2.8562	1.3846	5.8430	1.1316	-0.7108	3.5387	-0.0725	-0.4521	0.2029	-2.8262
1	Lung	4.9137	-3.6259	4.7307	7.1584	1.7702	4.9556	1.9149	2.6067	3.9270	...	4.2412	4.1211	7.7256	4.4900	2.9281	6.1483	5.1867	2.6185	4.7856	-0.0277
3	Lung	4.0541	-2.3147	4.5053	7.5651	4.1788	4.1772	5.3695	1.8444	4.5355	...	4.9631	1.9149	7.9947	3.3911	2.8462	6.7683	4.1636	2.7951	5.3284	1.2147
5	Lung	6.0732	-2.4659	3.9901	7.3945	4.7688	5.1157	4.3356	2.3366	5.0527	...	3.7378	4.7247	7.5016	5.1396	2.5036	6.5443	4.6531	3.8136	5.8679	0.7407
6	Lung	4.2510	-5.0116	3.3076	6.1715	3.1129	5.2954	2.2960	1.1184	5.2392	...	4.7104	2.7530	7.5022	4.0730	2.6325	6.0483	5.0562	2.6962	5.1611	0.9343
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1980	Brain	0.6969	-6.5064	-0.9686	2.3760	-2.2447	4.0739	-0.6193	-4.0350	4.8788	...	2.7357	1.5806	4.6882	-0.9971	-0.5756	3.5136	0.9343	-1.0862	0.4340	-2.2447
1981	Brain	0.1124	-5.0116	2.2482	2.8897	-0.5125	4.6445	0.3115	-3.6259	4.5110	...	2.1147	0.9716	5.1202	0.6608	0.4761	3.2343	0.8408	-0.0574	-0.1828	-2.5479
1982	Lung	5.3067	-3.8160	4.9065	7.5810	5.8714	4.7345	2.6185	3.1095	5.2032	...	5.6080	3.7324	8.2849	4.6201	3.6440	6.7052	5.1094	3.3364	5.8153	1.6604
1985	Brain	3.4622	-5.5735	1.5013	5.4835	1.7702	4.7517	0.6790	-3.1714	5.3597	...	1.1960	4.1740	4.3002	0.5470	-0.9971	3.7982	-0.2498	1.4808	-0.5125	-0.5125
1986	Lung	2.5585	-1.7809	6.7916	6.5865	2.7051	4.9519	4.3618	3.1892	7.7121	...	3.5779	2.8974	7.7685	4.8294	1.9149	5.9989	2.4117	2.4198	4.2080	1.0007