import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# ── Generate and prepare dataset (same pipeline) ──
# BUG FIX: n_samples was never defined in this chunk and the two
# np.column_stack literals were never closed. Seed the RNG so the script is
# reproducible (consistent with random_state=42 used for the split/model).
# NOTE(review): 2000 is an assumed sample count — confirm against the
# original pipeline this comment refers to.
np.random.seed(42)
n_samples = 2000

# 5% positive rate; labels: 0 = good board, 1 = defective board.
n_defective = int(n_samples * 0.05)
n_good = n_samples - n_defective
labels = np.array([0] * n_good + [1] * n_defective)

# Five numeric features per board. Good boards: tight distributions.
good_data = np.column_stack([
    np.random.normal(3.3, 0.05, n_good),
    np.random.normal(15.0, 1.0, n_good),
    np.random.normal(10.0, 0.3, n_good),
    np.random.normal(45.0, 2.0, n_good),
    np.random.normal(2.0, 0.2, n_good),
])  # shape (n_good, 5)

# Defective boards: shifted means and wider spreads on every feature.
def_data = np.column_stack([
    np.random.normal(3.1, 0.15, n_defective),
    np.random.normal(18.0, 3.0, n_defective),
    np.random.normal(11.5, 1.0, n_defective),
    np.random.normal(38.0, 5.0, n_defective),
    np.random.normal(3.5, 0.8, n_defective),
])  # shape (n_defective, 5)

# Stack both classes and shuffle rows (with labels kept aligned).
X = np.vstack([good_data, def_data])
shuffle_idx = np.random.permutation(n_samples)
X, labels = X[shuffle_idx], labels[shuffle_idx]
# ── Split, scale, and fit the classifier ──
# BUG FIX: the train_test_split(...) call was missing its closing paren.
# stratify=labels preserves the 5% defective rate in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, random_state=42, stratify=labels
)

# Fit the scaler on the training set only to avoid test-set leakage.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_s, y_train)

# Predicted probability of the positive (defective) class per test board.
y_proba = model.predict_proba(X_test_s)[:, 1]
# ── Precision-Recall and ROC curves ──
precisions, recalls, pr_thresholds = precision_recall_curve(y_test, y_proba)
fpr, tpr, roc_thresholds = roc_curve(y_test, y_proba)
# BUG FIX: roc_auc is printed and plotted below but was never computed.
roc_auc = auc(fpr, tpr)

# ── Threshold Analysis ──
# Show how moving the decision threshold trades precision against recall.
print("Effect of different decision thresholds:")
print(f"{'Threshold':<12} {'Precision':<12} {'Recall':<12} {'FP':<6} {'FN':<6}")
for threshold in [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]:
    y_pred_t = (y_proba >= threshold).astype(int)
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent at
    # this threshold, replacing the fragile cm.shape guard.
    cm = confusion_matrix(y_test, y_pred_t, labels=[0, 1])
    tp = cm[1, 1]
    fp = cm[0, 1]
    fn = cm[1, 0]
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    print(f"{threshold:<12.1f} {prec:<12.3f} {rec:<12.3f} {fp:<6d} {fn:<6d}")
# ── Three-panel figure: PR curve, ROC curve, probability histogram ──
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Precision-Recall curve.
axes[0].plot(recalls, precisions, color='steelblue', linewidth=2)
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
axes[0].set_title('Precision-Recall Curve')
axes[0].grid(True, alpha=0.3)
axes[0].set_xlim([0, 1.05])
axes[0].set_ylim([0, 1.05])

# ROC curve with the chance diagonal for reference.
axes[1].plot(fpr, tpr, color='steelblue', linewidth=2, label=f'AUC = {roc_auc:.3f}')
axes[1].plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Random classifier')
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate (Recall)')
axes[1].set_title('ROC Curve')
axes[1].grid(True, alpha=0.3)
axes[1].legend()  # BUG FIX: label= kwargs are ignored without a legend() call

# Predicted-probability distribution by true class.
axes[2].hist(y_proba[y_test == 0], bins=20, alpha=0.6, color='steelblue', label='Good boards')
axes[2].hist(y_proba[y_test == 1], bins=10, alpha=0.6, color='tomato', label='Defective boards')
axes[2].axvline(x=0.5, color='black', linestyle='--', label='Threshold = 0.5')
axes[2].set_xlabel('Predicted Probability of Defective')
axes[2].set_ylabel('Count')
axes[2].set_title('Probability Distribution by Class')
axes[2].grid(True, alpha=0.3)
axes[2].legend()  # BUG FIX: same — histogram labels were never shown

plt.tight_layout()
plt.savefig('classification_curves.png', dpi=100)
plt.close(fig)  # release the figure; the image file is already on disk

print(f"\nROC AUC: {roc_auc:.3f}")
print("AUC = 1.0 means perfect separation. AUC = 0.5 means random guessing.")
print("\nPlot saved as classification_curves.png")