{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfe01bb1-46b4-4866-83f9-2e2ab4077c12",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Для демо используется датасет https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "fac684f3-8bca-45f5-b3f7-5c3e5ee6ab24",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----+------------+------------+------------+------------+------------+------------+------------+-------+\n",
      "|A_id|        Size|      Weight|   Sweetness| Crunchiness|   Juiciness|    Ripeness|     Acidity|Quality|\n",
      "+----+------------+------------+------------+------------+------------+------------+------------+-------+\n",
      "|   0|-3.970048523|-2.512336381| 5.346329613|-1.012008712| 1.844900361| 0.329839797|-0.491590483|   good|\n",
      "|   1|-1.195217191|-2.839256528| 3.664058758| 1.588232309| 0.853285795| 0.867530082|-0.722809367|   good|\n",
      "|   2|-0.292023862|-1.351281995|-1.738429162|-0.342615928| 2.838635512|-0.038033328| 2.621636473|    bad|\n",
      "|   3|-0.657195773|-2.271626609| 1.324873847|-0.097874716| 3.637970491|-3.413761338| 0.790723217|   good|\n",
      "|   4|  1.36421682|-1.296611877|-0.384658206| -0.55300577| 3.030874354|-1.303849429| 0.501984036|   good|\n",
      "|   5|-3.425399755|-1.409082204|-1.913511195|-0.555774864| -3.85307147| 1.914615916|-2.981523169|    bad|\n",
      "|   6| 1.331605736| 1.635955715| 0.875974244| -1.67779794| 3.106344455|-1.847416733| 2.414170509|   good|\n",
      "|   7|-1.995462096| -0.42895848| 1.530643583|-0.742971676| 0.158834003| 0.974437858|-1.470125066|   good|\n",
      "|   8|-3.867632233|-3.734513576| 0.986429067| -1.20765455| 2.292872919| 4.080920787|-4.871904758|    bad|\n",
      "|   9|-0.727982709|-0.442820353|-4.092222827| 0.597512917| 0.393714261| 1.620856772| 2.185607723|    bad|\n",
      "|  10| -2.69933629|-1.329506988|-1.418506853|-0.625545768| 2.371074371| 3.403164523|-2.810808169|    bad|\n",
      "|  11| 2.450959845|-0.564177407|-1.635040727| 0.942399869| -2.08731667| 1.214321691| 1.294323927|   good|\n",
      "|  12|-0.170811719|-1.867271142|-1.771844728| 2.413155317|-3.094554738|-0.624884375|-2.076113997|    bad|\n",
      "|  13|-1.345530536|-1.623701121| 2.044143754| 1.754812932| 0.997567093| 0.434179855| 1.724026084|   good|\n",
      "|  14| 2.839580937|-0.344798192|-1.019797291| 0.894580857|-1.300060883|  0.58237862| 1.709708209|   good|\n",
      "|  15|-2.659887385|-2.795684208| 4.230403587| 0.697550395| 2.180911101|-0.088775396|-1.083620788|   good|\n",
      "|  16|-1.468951547|-1.950359588|-2.214372889| 0.909758509| 2.864448854| 3.965955655|-0.558208683|    bad|\n",
      "|  17|-0.074370177|-4.714749953| 0.249767641| 2.935319065|   1.4097551|-2.643810206| 1.250970347|   good|\n",
      "|  18|-0.302364267| 1.724395847|-2.442337223| 3.465108479| 0.449791675|-0.074362452| 2.493781985|    bad|\n",
      "|  19|-2.108049899| 0.356467403|-1.156193279| 4.326722517| 1.561543116|-4.630174264| -1.37665721|   good|\n",
      "+----+------------+------------+------------+------------+------------+------------+------------+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from datetime import datetime, date\n",
    "from pyspark.sql import Row\n",
    " \n",
    "ADMIN_REFRESH_TOKEN = \"<токен доступа>\"\n",
    " \n",
    "spark_connect_url = f\"sc://10.0.5.34:15002/;spark-token={ADMIN_REFRESH_TOKEN}\"\n",
    " \n",
    "spark = SparkSession.builder.remote(spark_connect_url).appName(\"Apple_ML\").getOrCreate()\n",
    " \n",
    "df = spark.read.csv(\"s3a://spark-k8s-6721-k8s-3d150103e71aaa-bucket/datasets/apple_quality.csv\", header=True, inferSchema=True)\n",
    "\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "86e3ad57-8d28-4a70-82d3-860e1414b62f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- A_id: integer (nullable = true)\n",
      " |-- Size: double (nullable = true)\n",
      " |-- Weight: double (nullable = true)\n",
      " |-- Sweetness: double (nullable = true)\n",
      " |-- Crunchiness: double (nullable = true)\n",
      " |-- Juiciness: double (nullable = true)\n",
      " |-- Ripeness: double (nullable = true)\n",
      " |-- Acidity: string (nullable = true)\n",
      " |-- Quality: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Просмотр схемы данных\n",
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "5cdae43a-5ed3-4b99-80fd-26327e1ae608",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4001"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6ecf703e-93b3-4653-a721-058dc5d8d4c5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+-------------------+-------------------+--------------------+------------------+------------------+------------------+--------------------+-------+\n",
      "|summary|              A_id|               Size|             Weight|           Sweetness|       Crunchiness|         Juiciness|          Ripeness|             Acidity|Quality|\n",
      "+-------+------------------+-------------------+-------------------+--------------------+------------------+------------------+------------------+--------------------+-------+\n",
      "|  count|              4000|               4000|               4000|                4000|              4000|              4000|              4000|                4001|   4000|\n",
      "|   mean|            1999.5|-0.5030146298267509|-0.9895465445945022|-0.47047851978824984| 0.985477903858501|0.5121179684932494|0.4982774280305005| 0.07687729571600004|   NULL|\n",
      "| stddev|1154.8448669265786|   1.92805868885498| 1.6025072141517547|  1.9434406589204516|1.4027572042119638|1.9302856730942939|1.8744267757033415|  2.1102696364209197|   NULL|\n",
      "|    min|                 0|       -7.151703059|       -7.149847675|        -6.894485494|      -6.055057805|      -5.961897048|      -5.864598918|        -0.000635882|    bad|\n",
      "|    max|              3999|        6.406366899|         5.79071359|         6.374915513|       7.619851801|       7.364402864|       7.237836684|Created_by_Nidula...|   good|\n",
      "+-------+------------------+-------------------+-------------------+--------------------+------------------+------------------+------------------+--------------------+-------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Общая статистика\n",
    "df.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "14c1b3bd-eee7-4154-9b88-73782cadcddd",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-------------------+-------------------+--------------------+\n",
      "|summary|               Size|             Weight|           Sweetness|\n",
      "+-------+-------------------+-------------------+--------------------+\n",
      "|  count|               4000|               4000|                4000|\n",
      "|   mean|-0.5030146298267509|-0.9895465445945022|-0.47047851978824984|\n",
      "| stddev|   1.92805868885498| 1.6025072141517547|  1.9434406589204516|\n",
      "|    min|       -7.151703059|       -7.149847675|        -6.894485494|\n",
      "|    max|        6.406366899|         5.79071359|         6.374915513|\n",
      "+-------+-------------------+-------------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Статистика по конкретным колонкам\n",
    "df.select(\"Size\", \"Weight\", \"Sweetness\").describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5e524eaf-47fc-45cd-8248-f967194546ab",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----+\n",
      "|Quality|count|\n",
      "+-------+-----+\n",
      "|   NULL|    1|\n",
      "|    bad| 1996|\n",
      "|   good| 2004|\n",
      "+-------+-----+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import functions as F\n",
    "\n",
    "df.groupBy(\"Quality\").count().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "9942bcc5-7d08-4018-9306-019d64938130",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4000"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Чистим строку с Null\n",
    "df = df.filter(df.Quality.isNotNull())\n",
    "df.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "e27c2f24-5a46-4d05-b775-de54155b50f6",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Процентное соотношение яблок по качеству:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/datadisk/jupyter-admin/.local/lib/python3.9/site-packages/pyspark/sql/connect/expressions.py:948: UserWarning: WARN WindowExpression: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----+----------+\n",
      "|Quality|count|Percentage|\n",
      "+-------+-----+----------+\n",
      "|    bad| 1996|      49.9|\n",
      "|   good| 2004|      50.1|\n",
      "+-------+-----+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.window import Window\n",
    "\n",
    "# Процентное соотношение яблок по качеству\n",
    "print(\"Процентное соотношение яблок по качеству:\")\n",
    "window = Window.partitionBy()  # создаем пустое окно для вычисления общей суммы\n",
    "df.groupBy(\"Quality\").count().withColumn(\n",
    "    \"Percentage\", \n",
    "    F.round(F.col(\"count\") / F.sum(\"count\").over(window) * 100, 2)\n",
    ").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "cd691761-e8c5-4296-b418-f480f55300e5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Средние значения характеристик по качеству:\n",
      "+-------+--------+----------+-------------+---------------+-------------+\n",
      "|Quality|Avg_Size|Avg_Weight|Avg_Sweetness|Avg_Crunchiness|Avg_Juiciness|\n",
      "+-------+--------+----------+-------------+---------------+-------------+\n",
      "|    bad|   -0.97|     -0.99|        -0.96|            1.0|         0.01|\n",
      "|   good|   -0.03|     -0.99|         0.02|           0.97|         1.01|\n",
      "+-------+--------+----------+-------------+---------------+-------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Средние значения характеристик по качеству\n",
    "print(\"Средние значения характеристик по качеству:\")\n",
    "\n",
    "df.groupBy(\"Quality\").agg(\n",
    "    F.round(F.avg(\"Size\"), 2).alias(\"Avg_Size\"),\n",
    "    F.round(F.avg(\"Weight\"), 2).alias(\"Avg_Weight\"),\n",
    "    F.round(F.avg(\"Sweetness\"), 2).alias(\"Avg_Sweetness\"),\n",
    "    F.round(F.avg(\"Crunchiness\"), 2).alias(\"Avg_Crunchiness\"),\n",
    "    F.round(F.avg(\"Juiciness\"), 2).alias(\"Avg_Juiciness\")\n",
    ").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "30143b84-7979-477f-be25-c1742a54889e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Распределение характеристик для каждого качества:\n",
      "\n",
      "Распределение Size:\n",
      "+-------+--------+-------+-----------+-------+--------+\n",
      "|Quality|Min_Size|Q1_Size|Median_Size|Q3_Size|Max_Size|\n",
      "+-------+--------+-------+-----------+-------+--------+\n",
      "|    bad|   -7.15|  -2.18|      -1.02|   0.22|    4.65|\n",
      "|   good|   -6.91|  -1.38|       0.03|   1.34|    6.41|\n",
      "+-------+--------+-------+-----------+-------+--------+\n",
      "\n",
      "\n",
      "Распределение Weight:\n",
      "+-------+----------+---------+-------------+---------+----------+\n",
      "|Quality|Min_Weight|Q1_Weight|Median_Weight|Q3_Weight|Max_Weight|\n",
      "+-------+----------+---------+-------------+---------+----------+\n",
      "|    bad|     -5.75|    -1.84|        -0.98|    -0.15|      3.08|\n",
      "|   good|     -7.15|    -2.22|         -1.0|     0.29|      5.79|\n",
      "+-------+----------+---------+-------------+---------+----------+\n",
      "\n",
      "\n",
      "Распределение Sweetness:\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "|Quality|Min_Sweetness|Q1_Sweetness|Median_Sweetness|Q3_Sweetness|Max_Sweetness|\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "|    bad|        -6.89|        -2.2|           -0.92|        0.31|         5.56|\n",
      "|   good|        -5.12|       -1.28|           -0.12|        1.24|         6.37|\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "\n",
      "\n",
      "Распределение Crunchiness:\n",
      "+-------+---------------+--------------+------------------+--------------+---------------+\n",
      "|Quality|Min_Crunchiness|Q1_Crunchiness|Median_Crunchiness|Q3_Crunchiness|Max_Crunchiness|\n",
      "+-------+---------------+--------------+------------------+--------------+---------------+\n",
      "|    bad|          -2.62|          0.25|              0.98|          1.75|            6.3|\n",
      "|   good|          -6.06|         -0.13|              1.05|          2.04|           7.62|\n",
      "+-------+---------------+--------------+------------------+--------------+---------------+\n",
      "\n",
      "\n",
      "Распределение Juiciness:\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "|Quality|Min_Juiciness|Q1_Juiciness|Median_Juiciness|Q3_Juiciness|Max_Juiciness|\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "|    bad|        -5.96|       -1.38|             0.0|        1.43|         6.33|\n",
      "|   good|        -5.26|       -0.11|            1.02|        2.15|         7.36|\n",
      "+-------+-------------+------------+----------------+------------+-------------+\n",
      "\n",
      "\n",
      "Распределение Ripeness:\n",
      "+-------+------------+-----------+---------------+-----------+------------+\n",
      "|Quality|Min_Ripeness|Q1_Ripeness|Median_Ripeness|Q3_Ripeness|Max_Ripeness|\n",
      "+-------+------------+-----------+---------------+-----------+------------+\n",
      "|    bad|       -5.86|      -0.23|           0.96|       2.28|        7.24|\n",
      "|   good|       -5.31|      -1.19|           0.09|       1.18|        5.55|\n",
      "+-------+------------+-----------+---------------+-----------+------------+\n",
      "\n",
      "\n",
      "Распределение Acidity:\n",
      "+-------+-----------+----------+--------------+----------+-----------+\n",
      "|Quality|Min_Acidity|Q1_Acidity|Median_Acidity|Q3_Acidity|Max_Acidity|\n",
      "+-------+-----------+----------+--------------+----------+-----------+\n",
      "|    bad|        0.0|     -1.43|         -0.01|      1.59|        7.4|\n",
      "|   good|        0.0|     -1.33|          0.06|      1.45|       7.19|\n",
      "+-------+-----------+----------+--------------+----------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Анализ распределения характеристик\n",
    "print(\"Распределение характеристик для каждого качества:\")\n",
    "for characteristic in [\"Size\", \"Weight\", \"Sweetness\", \"Crunchiness\", \"Juiciness\", \"Ripeness\", \"Acidity\"]:\n",
    "    print(f\"\\nРаспределение {characteristic}:\")\n",
    "    df.groupBy(\"Quality\").agg(\n",
    "        F.round(F.min(characteristic), 2).alias(f\"Min_{characteristic}\"),\n",
    "        F.round(F.percentile_approx(characteristic, 0.25), 2).alias(f\"Q1_{characteristic}\"),\n",
    "        F.round(F.percentile_approx(characteristic, 0.5), 2).alias(f\"Median_{characteristic}\"),\n",
    "        F.round(F.percentile_approx(characteristic, 0.75), 2).alias(f\"Q3_{characteristic}\"),\n",
    "        F.round(F.max(characteristic), 2).alias(f\"Max_{characteristic}\")\n",
    "    ).orderBy(\"Quality\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "26366ab7-5629-4a0c-996f-329eeeff8253",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------------+---------------+------------------+--------------------+------------------+-----------------+----------------+\n",
      "|Size_outliers|Weight_outliers|Sweetness_outliers|Crunchiness_outliers|Juiciness_outliers|Ripeness_outliers|Acidity_outliers|\n",
      "+-------------+---------------+------------------+--------------------+------------------+-----------------+----------------+\n",
      "|           56|            115|                95|                 105|                73|               71|              64|\n",
      "+-------------+---------------+------------------+--------------------+------------------+-----------------+----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.functions import col, when\n",
    "from pyspark.sql.types import DoubleType\n",
    "\n",
    "df = df.withColumn(\"Acidity\", col(\"Acidity\").cast(DoubleType()))\n",
    "\n",
    "def detect_outliers(df, column):\n",
    "    # Вычисляем квартили\n",
    "    quantiles = df.approxQuantile(column, [0.25, 0.75], 0.05)\n",
    "    Q1 = quantiles[0]\n",
    "    Q3 = quantiles[1]\n",
    "    IQR = Q3 - Q1\n",
    "    \n",
    "    # Определяем границы для выбросов\n",
    "    lower_bound = Q1 - 1.5 * IQR\n",
    "    upper_bound = Q3 + 1.5 * IQR\n",
    "    \n",
    "    # Помечаем выбросы\n",
    "    return df.withColumn(f\"{column}_outlier\", \n",
    "                         when((col(column) < lower_bound) | (col(column) > upper_bound), 1).otherwise(0))\n",
    "\n",
    "# Применяем функцию ко всем числовым колонкам\n",
    "numeric_columns = [\"Size\", \"Weight\", \"Sweetness\", \"Crunchiness\", \"Juiciness\", \"Ripeness\", \"Acidity\"]\n",
    "for column in numeric_columns:\n",
    "    df = detect_outliers(df, column)\n",
    "\n",
    "# Подсчитываем количество выбросов для каждой характеристики\n",
    "outlier_counts = df.select([F.sum(f\"{col}_outlier\").alias(f\"{col}_outliers\") for col in numeric_columns])\n",
    "outlier_counts.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "06a533d4-97b4-462d-a002-e8255cec423b",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Анализ влияния Size на качество:\n",
      "+-------------+----------+---------+-----------------+\n",
      "|Size_category|good_count|bad_count|  good_percentage|\n",
      "+-------------+----------+---------+-----------------+\n",
      "|         High|       959|      540|63.97598398932621|\n",
      "|          Low|       513|      835|38.05637982195846|\n",
      "|       Medium|       532|      621|46.14050303555941|\n",
      "+-------------+----------+---------+-----------------+\n",
      "\n",
      "\n",
      "Анализ влияния Weight на качество:\n",
      "+---------------+----------+---------+------------------+\n",
      "|Weight_category|good_count|bad_count|   good_percentage|\n",
      "+---------------+----------+---------+------------------+\n",
      "|           High|       797|      702|53.168779186124084|\n",
      "|            Low|       728|      620| 54.00593471810089|\n",
      "|         Medium|       479|      674| 41.54379878577624|\n",
      "+---------------+----------+---------+------------------+\n",
      "\n",
      "\n",
      "Анализ влияния Sweetness на качество:\n",
      "+------------------+----------+---------+-----------------+\n",
      "|Sweetness_category|good_count|bad_count|  good_percentage|\n",
      "+------------------+----------+---------+-----------------+\n",
      "|              High|       927|      572|61.84122748498999|\n",
      "|               Low|       505|      843|37.46290801186943|\n",
      "|            Medium|       572|      581|49.60971379011275|\n",
      "+------------------+----------+---------+-----------------+\n",
      "\n",
      "\n",
      "Анализ влияния Crunchiness на качество:\n",
      "+--------------------+----------+---------+-----------------+\n",
      "|Crunchiness_category|good_count|bad_count|  good_percentage|\n",
      "+--------------------+----------+---------+-----------------+\n",
      "|                High|       795|      704|53.03535690460307|\n",
      "|                 Low|       725|      623| 53.7833827893175|\n",
      "|              Medium|       484|      669| 41.9774501300954|\n",
      "+--------------------+----------+---------+-----------------+\n",
      "\n",
      "\n",
      "Анализ влияния Juiciness на качество:\n",
      "+------------------+----------+---------+------------------+\n",
      "|Juiciness_category|good_count|bad_count|   good_percentage|\n",
      "+------------------+----------+---------+------------------+\n",
      "|              High|       926|      573| 61.77451634422949|\n",
      "|               Low|       447|      901|33.160237388724035|\n",
      "|            Medium|       631|      522| 54.72679965307893|\n",
      "+------------------+----------+---------+------------------+\n",
      "\n",
      "\n",
      "Анализ влияния Ripeness на качество:\n",
      "+-----------------+----------+---------+------------------+\n",
      "|Ripeness_category|good_count|bad_count|   good_percentage|\n",
      "+-----------------+----------+---------+------------------+\n",
      "|             High|       537|      962|35.823882588392266|\n",
      "|              Low|       864|      484| 64.09495548961425|\n",
      "|           Medium|       603|      550|52.298352124891586|\n",
      "+-----------------+----------+---------+------------------+\n",
      "\n",
      "\n",
      "Анализ влияния Acidity на качество:\n",
      "+----------------+----------+---------+------------------+\n",
      "|Acidity_category|good_count|bad_count|   good_percentage|\n",
      "+----------------+----------+---------+------------------+\n",
      "|            High|       761|      738|50.767178118745825|\n",
      "|             Low|       653|      695| 48.44213649851632|\n",
      "|          Medium|       590|      563| 51.17085862966175|\n",
      "+----------------+----------+---------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.functions import when, count\n",
    "\n",
    "def analyze_quality_relation(df, column):\n",
    "    # Разделяем значения на категории\n",
    "    categorized = df.withColumn(f\"{column}_category\",\n",
    "        when(col(column) <= df.approxQuantile(column, [0.33], 0.05)[0], \"Low\")\n",
    "        .when(col(column) <= df.approxQuantile(column, [0.66], 0.05)[0], \"Medium\")\n",
    "        .otherwise(\"High\")\n",
    "    )\n",
    "    \n",
    "    # Подсчитываем распределение качества для каждой категории\n",
    "    distribution = categorized.groupBy(f\"{column}_category\") \\\n",
    "        .agg(count(when(col(\"Quality\") == \"good\", 1)).alias(\"good_count\"),\n",
    "             count(when(col(\"Quality\") == \"bad\", 1)).alias(\"bad_count\")) \\\n",
    "        .withColumn(\"good_percentage\", col(\"good_count\") / (col(\"good_count\") + col(\"bad_count\")) * 100)\n",
    "    \n",
    "    return distribution\n",
    "\n",
    "# Применяем анализ ко всем числовым колонкам\n",
    "for column in numeric_columns:\n",
    "    print(f\"\\nАнализ влияния {column} на качество:\")\n",
    "    analyze_quality_relation(df, column).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "928489a5-23cc-41fd-ae40-abba61847798",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
