Добавленно EDA

2025-11-24 21:02:52 +03:00
--- a/main_new.py
+++ b/main_new.py
@@ -1,34 +1,135 @@
 import clickhouse_connect
 import time
-from scipy.sparse import lil_matrix, vstack
+import os
 from sklearn.preprocessing import normalize
 from sklearn.feature_extraction import FeatureHasher
 from sklearn.decomposition import TruncatedSVD
 import hdbscan
 import numpy as np
-# ----------------- Замер времени -----------------
+from scipy.sparse import lil_matrix, csr_matrix
-start_time = time.time()
+from sklearn.preprocessing import normalize
-start_time_all = start_time
+import hdbscan
-# ----------------- Подключение к ClickHouse -----------------
+from sklearn.metrics.pairwise import cosine_similarity
 from collections import defaultdict
 import gc
 import matplotlib.pyplot as plt
 from sklearn.decomposition import TruncatedSVD
 # Начало замера времени всего скрипта
 start_time_all = time.time()
 # Подключение к ClickHouse
 try:
    client = clickhouse_connect.get_client(
        host='localhost',
        port=8123,
        username='default',
        password='',
-        settings={
+        settings={'max_memory_usage': 4_000_000_000}
            'max_memory_usage': 4_000_000_000,
            'max_execution_time': 3600,
        }
    )
    print("Подключение к ClickHouse установлено")
 except Exception as e:
    print(f"Ошибка подключения: {e}")
    exit()
-interval = 5  # интервал времени в секундах
+# Интервал
 interval = 600  # 10 минут
-# ----------------- Получаем активные IP -----------------
+
 # ---------------------- EDA --------------------------
 # Создаем папку куда будем складывать графики
 os.makedirs("eda_plots", exist_ok=True)
 # События по минутам
 events_query = """
 SELECT
    toStartOfMinute(timestamp) AS ts,
    count() AS events
 FROM DIPLOM_NEW
 GROUP BY ts
 ORDER BY ts
 """
 events_result = client.query(events_query)
 events_ts = [row[0] for row in events_result.result_rows]
 events_cnt = [row[1] for row in events_result.result_rows]
 # Уникальные IP по минутам
 unique_query = """
 SELECT
    toStartOfMinute(timestamp) AS ts,
    uniqExact(ip) AS uniq_ips
 FROM DIPLOM_NEW
 GROUP BY ts
 ORDER BY ts
 """
 unique_result = client.query(unique_query)
 unique_ts = [row[0] for row in unique_result.result_rows]
 unique_cnt = [row[1] for row in unique_result.result_rows]
 # EDA статистика
 print("\n--- EDA статистика ---")
 events_np = np.array(events_cnt)
 unique_np = np.array(unique_cnt)
 mean_events = events_np.mean()
 std_events = events_np.std()
 mean_unique = unique_np.mean()
 std_unique = unique_np.std()
 print(f"Общее количество событий: {sum(events_cnt):,}")
 print(f"Среднее событий в минуту: {mean_events:.1f} ± {std_events:.1f}")
 print(f"Максимальная активность: {max(events_cnt)} событий/минуту")
 print(f"Среднее уникальных IP/минуту: {mean_unique:.1f} ± {std_unique:.1f}")
 # ЛОГИКА ОПРЕДЕЛЕНИЯ АТАК ---
 # Атаки определяются по правилу 3-х сигм
 # Значения > среднее + 3σ считаются статистически значимыми аномалиями(предпологаем что ботнет)
 #Анализируем данные
 attack_threshold_3sigma = mean_events + 3 * std_events  # 99.7% доверительный интервал
 attack_ts_3sigma = [(ts, cnt) for ts, cnt in zip(events_ts, events_cnt) if cnt > attack_threshold_3sigma]
 attack_ts = [ts for ts, cnt in attack_ts_3sigma]
 attack_counts = [cnt for ts, cnt in attack_ts_3sigma]
 # График событий с пометкой атак
 plt.figure(figsize=(16, 8))
 # Основной график событий
 plt.subplot(2, 1, 1)
 plt.plot(events_ts, events_cnt, 'b-', alpha=0.7, label='События')
 plt.axhline(y=mean_events, color='green', linestyle='--', alpha=0.7, label=f'Среднее ({mean_events:.1f})')
 plt.axhline(y=attack_threshold_3sigma, color='red', linestyle='--',label=f'Порог атаки 3σ ({attack_threshold_3sigma:.1f})')
 # Помечаем атаки красными точками
 attack_indices = [i for i, cnt in enumerate(events_cnt) if cnt > attack_threshold_3sigma]
 attack_x = [events_ts[i] for i in attack_indices]
 attack_y = [events_cnt[i] for i in attack_indices]
 plt.scatter(attack_x, attack_y, color='red', s=30, zorder=5, label='Обнаруженные атаки')
 plt.title("Количество событий по минутам с пометкой атак")
 plt.ylabel("Событий в минуту")
 plt.legend()
 plt.grid(True, alpha=0.3)
 # --- График уникальных IP ---
 plt.subplot(2, 1, 2)
 plt.plot(unique_ts, unique_cnt, color="orange", label='Уникальные IP')
 plt.axhline(y=mean_unique, color='green', linestyle='--', alpha=0.7, label=f'Среднее ({mean_unique:.1f})')
 # Помечаем те же интервалы атак
 attack_unique_y = [unique_cnt[i] for i in attack_indices]
 plt.scatter(attack_x, attack_unique_y, color='red', s=30, zorder=5, label='Интервалы атак')
 plt.title("Количество уникальных IP по минутам")
 plt.ylabel("Уникальных IP")
 plt.xlabel("Время")
 plt.legend()
 plt.grid(True, alpha=0.3)
 plt.tight_layout()
 plt.savefig("eda_plots/events_with_attacks.png", dpi=150)
 plt.close()
 print("EDA завершено. Графики сохранены в eda_plots/events_with_attacks.png\n")
 # Получение активных IP
 active_ips_query = """
 SELECT ip
 FROM DIPLOM_NEW
@@ -37,26 +138,31 @@ HAVING COUNT() >= 25
 """
 active_ips_result = client.query(active_ips_query)
 data_IP = [row[0] for row in active_ips_result.result_rows]
 print(f"Активных IP (≥25 событий): {len(data_IP)}")
 ip_to_idx = {ip: idx for idx, ip in enumerate(data_IP)}
 print(f"Активных IP: {len(data_IP)}")
-# ----------------- Временной диапазон -----------------
+
 # Временной диапазон
 time_range_query = "SELECT MIN(timestamp), MAX(timestamp) FROM DIPLOM_NEW"
-time_result = client.query(time_range_query)
+min_time, max_time = client.query(time_range_query).result_rows[0]
-min_time, max_time = time_result.result_rows[0]
+
-lenn = int((max_time.timestamp() - min_time.timestamp()) // interval) + 1
+lenn = int((max_time.timestamp() - min_time.timestamp()) // interval) + 1
-print(f"Интервалов: {lenn}")
+print(f"Временных интервалов по {interval} сек: {lenn}")
 # Формируем разреженную матрицу
 print("Строим разреженную матрицу активности...")
 # ----------------- Создание матрицы активности -----------------
 print("Формируем разреженную матрицу активности...")
 activity_matrix = lil_matrix((len(data_IP), lenn), dtype=np.int8)
-batch_size = 10000 # для обработки частями, чтобы не былоо ошибок по памяти
+batch_size = 10000
 client.command("DROP TEMPORARY TABLE IF EXISTS active_ips")
 client.command("CREATE TEMPORARY TABLE active_ips (ip String)")
 for i in range(0, len(data_IP), batch_size):
-    batch = data_IP[i:i+batch_size]
+    batch = data_IP[i:i + batch_size]
    values = ", ".join([f"('{ip}')" for ip in batch])
    client.command(f"INSERT INTO active_ips VALUES {values}")
@@ -66,117 +172,147 @@ FROM DIPLOM_NEW t
 INNER JOIN active_ips a ON t.ip = a.ip
 ORDER BY t.timestamp
 """
 data_result = client.query(data_query)
 data = data_result.result_rows
-for ip, ts in data:
+for ip, ts in client.query(data_query).result_rows:
    row = ip_to_idx[ip]
    col = int((ts.timestamp() - min_time.timestamp()) // interval)
    activity_matrix[row, col] = 1
 activity_matrix = activity_matrix.tocsr()
-print(f"Время подготовки данных: {time.time() - start_time:.2f} сек")
+print("Матрица построена.\n")
-# ----------------- Хеширование -----------------
+# ------------- Нормализация и снижение размерности -------------
-use_fraction = 20 # на сколько мы делим число интервалов
+activity_matrix = normalize(activity_matrix, norm="l2")
-n_features = max(1, lenn // use_fraction)
+reducer = TruncatedSVD(
    n_components=50,
    random_state=42,
    algorithm='randomized'
 )
-hasher = FeatureHasher(n_features=n_features, input_type='pair')
+X_reduced = reducer.fit_transform(activity_matrix)
 batch_size = 1000
 batches = len(data_IP) // batch_size + 1
 X_hashed_parts = []
 for b in range(batches):
    start = b * batch_size
    end = min((b + 1) * batch_size, len(data_IP))
    batch_rows = []
    for i in range(start, end):
        row = activity_matrix[i]
        if row.nnz == 0:
            batch_rows.append(hasher.transform([[]]))
        else:
            features = [(str(idx), 1) for idx in row.indices]
            batch_rows.append(hasher.transform([features]))
    batch_matrix = vstack(batch_rows)
    X_hashed_parts.append(batch_matrix)
    del batch_rows
 # Объединяем части матрицы
 X_hashed = vstack(X_hashed_parts)
 del X_hashed_parts
-
+# ----------------- КЛАСТЕРИЗАЦИЯ ---------------------
 # Нормализация
 X_hashed = normalize(X_hashed, norm='l2', axis=1)
 # Снижение размерности
 svd_components = 100  # колличество компонент
 svd = TruncatedSVD(n_components=svd_components)
 X_reduced = svd.fit_transform(X_hashed)
 # ----------------- Кластеризация -----------------
 print("Запускаем HDBSCAN...")
 clusterer = hdbscan.HDBSCAN(
    min_cluster_size=5,
    min_samples=3,
-    metric='euclidean',
+    metric="euclidean",
-    cluster_selection_epsilon=0.05,
+    cluster_selection_method="eom"
    cluster_selection_method='eom'
 )
 labels = clusterer.fit_predict(X_reduced)
 print("Кластеризация завершена.\n")
 # ----------------- Подсчёт внутрикластерной схожести -----------------
 from sklearn.metrics.pairwise import cosine_similarity
 from collections import defaultdict
 import gc
-print("Вычисляем внутрикластерную схожесть (cosine similarity)...")
+# ---------АНАЛИЗ КЛАСТЕРОВ-------------------------
-cluster_similarities = {}
+print("\nАнализируем кластеры...")
 #Получим кластеры без шума
 clusters = defaultdict(list)
 for ip, label in zip(data_IP, labels):
-    clusters[label].append(ip)
+    if label != -1:
        clusters[label].append(ip)
-for cluster_id in set(labels):
+# Преобразуем EDA интервалы атак в индексы колонок
-    if cluster_id == -1:
+attack_intervals = [
-        continue
+    int((ts.timestamp() - min_time.timestamp()) // interval)
    for ts in attack_ts
 ]
-    indices = [i for i, l in enumerate(labels) if l == cluster_id]
+# ПОИСК ПОДОЗРИТЕЛЬНЫХ КЛАСТЕРОВ
-    n = len(indices)
+cluster_stats = []
-    # Ограничиваем слишком большие кластеры, чтобы не было проблем по памяти
+for cid, ips in clusters.items():
-    if n > 500:
+    indices = [ip_to_idx[ip] for ip in ips]
-        sample_indices = np.random.choice(indices, size=500, replace=False)
+
    total_activity = 0
    attack_activity = 0
    # Считаем активность
    for idx in indices:
        row = activity_matrix[idx]
        total_activity += row.sum()
        # активность в интервалы EDA-атак
        for ai in attack_intervals:
            if ai < row.shape[1]:
                attack_activity += row[0, ai]
    attack_percent = (attack_activity / total_activity * 100) if total_activity else 0
    # Считаем подозрительных критериев
    danger_score = 0
    if len(ips) >= 8:                 # крупный кластер
        danger_score += 1
    if attack_percent >= 10:          # участвует в EDA-аномалиях
        danger_score += 1
    if total_activity / len(ips) > 5: # активность на один IP высокая
        danger_score += 1
    cluster_stats.append({
        'cluster_id': cid,
        'ips': ips,
        'ip_indices': indices,
        'ip_count': len(ips),
        'attack_percent': attack_percent,
        'total_activity': int(total_activity),
        'danger_score': danger_score
    })
 # Сортируем по подозрительности (без учета similarity!)
 cluster_stats.sort(key=lambda x: x['danger_score'], reverse=True)
 # Смотрим схожесть внутри кластеров
 def calc_cluster_similarity(ip_indices, max_size=500):
    if len(ip_indices) > max_size:
        sample = np.random.choice(ip_indices, size=max_size, replace=False)
    else:
-        sample_indices = indices
+        sample = ip_indices
-    submatrix = activity_matrix[sample_indices]
+    sims = cosine_similarity(activity_matrix[sample])
    return float(sims.mean())
    sims = cosine_similarity(submatrix, dense_output=False)
    mean_sim = sims.mean()
    cluster_similarities[cluster_id] = float(mean_sim)
    gc.collect()
-# ----------------- Сохранение результатов -----------------
+#Добавляем информацию о схожести
-output_file = "botnet_clusters.txt"
+for c in cluster_stats:
    c['similarity'] = calc_cluster_similarity(c['ip_indices'])
 with open(output_file, "w") as f:
    for cluster_id, ips in clusters.items():
        if cluster_id == -1:
            continue
        sim = cluster_similarities.get(cluster_id, 0)
        f.write(f"=== Кластер {cluster_id} (IP: {len(ips)}, средняя схожесть={sim:.4f}) ===\n")
        for ip in ips:
            f.write(ip + "\n")
        f.write("\n")
-# ----------------- Финальная статистика -----------------
+# ----ФИНАЛЬНОЕ РЕШЕНИЕ: БОТНЕТ / НЕ БОТНЕТ----------------+
-n_clusters = len([c for c in clusters if c != -1])
+
-print(f"Найдено кластеров: {n_clusters}")
+def final_botnet_decision(c):
-print(f"Результаты сохранены в: {output_file}")
+    """
-print(f"Время обработки всего скрипта: {time.time()-start_time_all:.2f}")
+    Логика такая:
    - danger_score — первичный индикатор (до similarity)
    - similarity — контрольная проверка координации
    """
    if c['danger_score'] < 2:   # слишком низкая подозрительность
        return False
    if c['similarity'] < 0.6:   # нет координации — не ботнет
        return False
    return True                 # ботнет подтверждён
 botnets = [c for c in cluster_stats if final_botnet_decision(c)]
 normals = [c for c in cluster_stats if not final_botnet_decision(c)]
 print(f"\nИтого ботнет-кластеров: {len(botnets)}")
 print(f"Всего IP в ботнетах: {sum(c['ip_count'] for c in botnets)}")
 # -----СОХРАНЯЕМ ТОЛЬКО БОТНЕТЫ------------------------------
 with open("botnet_clusters.txt", "w", encoding='utf-8') as f:
    for i, c in enumerate(botnets, 1):
        f.write(f"\n--- БОТНЕТ #{i} ---\n")
        f.write(f"Количество IP: {c['ip_count']}\n")
        f.write(f"Участие в атаках(EDA): {c['attack_percent']:.1f}%\n")
        f.write(f"Схожесть: {c['similarity']:.3f}\n\n")
        f.write("Список IP:\n")
        for ip in c['ips']:
            f.write(f"  {ip}\n")
 print("\nРезультаты записаны в botnet_clusters.txt")
 print(f"Время выполнения: {time.вtime() - start_time_all:.2f} сек")