|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "id": "ff6ca0fb-e358-4dbe-9951-e5167758d030", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "import datetime\n", |
| 11 | + "import gzip\n", |
| 12 | + "import json\n", |
| 13 | +    "import subprocess\n",
| 14 | +    "import time\n",
| 15 | + "\n", |
| 16 | + "import dask\n", |
| 17 | + "from dask.distributed import Client\n", |
| 18 | + "import numpy as np\n", |
| 19 | + "import matplotlib.dates as mdates\n", |
| 20 | + "import matplotlib.pyplot as plt\n", |
| 21 | + "\n", |
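| 22 | +    "# connect to the already-running Dask distributed scheduler over TLS\n",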
| 22 | + "client = Client(\"tls://localhost:8786\")" |
| 23 | + ] |
| 24 | + }, |
| 25 | + { |
| 26 | + "cell_type": "code", |
| 27 | + "execution_count": null, |
| 28 | + "id": "d55ec891-e0c2-4f43-ba35-2077ecafcc07", |
| 29 | + "metadata": {}, |
| 30 | + "outputs": [], |
| 31 | + "source": [ |
| 32 | +    "def get_input(max_size_GB=None):\n",
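| 33 | +    "    \"\"\"Return file names and sizes; stop once the running total exceeds max_size_GB.\"\"\"\n",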
| 33 | + " with gzip.open(\"file_metadata.json.gz\") as f:\n", |
| 34 | + " dataset_info = json.loads(f.read().decode())\n", |
| 35 | + "\n", |
| 36 | + " all_files = []\n", |
| 37 | + " all_sizes_GB = []\n", |
| 38 | + " for containers_for_category in dataset_info.values():\n", |
| 39 | + " for container, metadata in containers_for_category.items():\n", |
| 40 | + " if metadata[\"files_output\"] is None:\n", |
| 41 | + " continue\n", |
| 42 | + " for fname, size in zip(metadata[\"files_output\"], metadata[\"sizes_output_GB\"]):\n", |
| 43 | + " all_files.append(fname)\n", |
| 44 | + " all_sizes_GB.append(size)\n", |
| 45 | + " if max_size_GB and sum(all_sizes_GB) > max_size_GB:\n", |
| 46 | + " return all_files, all_sizes_GB\n", |
| 47 | + " return all_files, all_sizes_GB\n", |
| 48 | + "\n", |
| 49 | +    "all_files, all_sizes_GB = get_input(max_size_GB=None)  # limit list to specific total size\n",
| 50 | + "print(f\"list of {len(all_files)} files with total size {sum(all_sizes_GB):.2f} GB\")" |
| 51 | + ] |
| 52 | + }, |
| 53 | + { |
| 54 | + "cell_type": "code", |
| 55 | + "execution_count": null, |
| 56 | + "id": "1c5cb8c1-ebf6-41f7-a12d-feff9a0456b8", |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [], |
| 59 | + "source": [ |
| 60 | +    "def run_xrdcp(fname, size):\n",
| 61 | +    "    # copy the file to /dev/null with xrdcp to measure pure read throughput;\n",
| 62 | +    "    # check=True raises if the copy fails, so failed transfers do not silently count as data read\n",
| 63 | +    "    t0 = time.time()\n",
| 64 | +    "    subprocess.run([\"xrdcp\", \"-f\", fname, \"/dev/null\"], check=True)\n",
| 65 | +    "    t1 = time.time()\n",
| 66 | +    "    return {\"t0\": t0, \"t1\": t1, \"GBread\": size}\n",
| 65 | + "\n", |
| 66 | + "t0 = time.time()\n", |
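| 67 | +    "# one delayed task per file; dask.compute fans the copies out across all workers\n",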
| 67 | + "tasks = [dask.delayed(run_xrdcp)(fname, size) for fname, size in zip(all_files, all_sizes_GB)]\n", |
| 68 | + "res = dask.compute(*tasks)\n", |
| 69 | + "t1 = time.time()" |
| 70 | + ] |
| 71 | + }, |
| 72 | + { |
| 73 | + "cell_type": "markdown", |
| 74 | + "id": "c04faacc-8136-4ef9-910a-98b8660cb4d6", |
| 75 | + "metadata": {}, |
| 76 | + "source": [ |
| 77 | +    "Track the egress rate live on the Grafana dashboard: [link](https://grafana.mwt2.org/d/EKefjM-Sz/af-network-200gbps-challenge?orgId=1&from=now-1h&to=now&viewPanel=panel-205&refresh=5s)"
| 78 | + ] |
| 79 | + }, |
| 80 | + { |
| 81 | + "cell_type": "code", |
| 82 | + "execution_count": null, |
| 83 | + "id": "dd03f327-857a-4d80-a013-3a13549de37f", |
| 84 | + "metadata": {}, |
| 85 | + "outputs": [], |
| 86 | + "source": [ |
| 87 | + "total_runtime_sum = sum(r[\"t1\"] - r[\"t0\"] for r in res)\n", |
| 88 | + "\n", |
| 89 | +    "print(f\"summed per-task process time: {total_runtime_sum:.2f} s\")\n",
| 90 | +    "print(f\" -> average data rate per task: {sum(all_sizes_GB) * 8 / total_runtime_sum:.2f} Gbps\")\n",
| 91 | + "\n", |
| 92 | + "print(f\"walltime: {t1-t0:.2f} s\")\n", |
| 93 | + "print(f\" -> total data rate: {sum(all_sizes_GB) * 8 / (t1-t0):.2f} Gbps\")\n", |
| 94 | + "\n", |
| 95 | + "starts = np.asarray([r[\"t0\"] for r in res])\n", |
| 96 | + "ends = np.asarray([r[\"t1\"] for r in res])\n", |
| 97 | +    "GBread = np.asarray([r[\"GBread\"] for r in res])\n",
| 98 | +    "rates_per_chunk = GBread / (ends - starts)  # average GB/s of each transfer\n",
| 99 | + "\n", |
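| 100 | +    "# sample the aggregate instantaneous rate on a uniform time grid:\n",
| 101 | +    "# at each sample point, sum the average rates of all transfers active at that moment\n",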
| 100 | + "t_samples = np.linspace(t0, t1, 100)\n", |
| 101 | + "rate_samples = []\n", |
| 102 | + "for t in t_samples:\n", |
| 103 | +    "    mask = (starts <= t) & (t < ends)\n",
| 104 | +    "    rate_samples.append(float(rates_per_chunk[mask].sum() * 8))\n",
| 105 | + "\n", |
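| 106 | +    "# consistency check: integrating the sampled rate should approximately recover the total volume read\n",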
| 106 | + "print(f\"total data read from data rate integral: {sum((t_samples[1] - t_samples[0]) * np.asarray(rate_samples)) / 8:.2f} GB\")\n", |
| 107 | + "t_samples = [datetime.datetime.fromtimestamp(t) for t in t_samples.tolist()]\n", |
| 108 | + "\n", |
| 109 | + "fig, ax = plt.subplots()\n", |
| 110 | + "ax.plot(t_samples, rate_samples, marker=\"v\", linewidth=0)\n", |
| 111 | + "ax.set_xlabel(\"time\")\n", |
| 112 | + "ax.tick_params(axis=\"x\", labelrotation=45)\n", |
| 113 | + "ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))\n", |
| 114 | + "ax.set_ylabel(\"data rate [Gbps]\")\n", |
| 115 | + "ax.set_ylim([0, ax.get_ylim()[1] * 1.1])\n", |
| 116 | + "fig.savefig(\"xrdcp_rate.png\")" |
| 117 | + ] |
| 118 | + } |
| 119 | + ], |
| 120 | + "metadata": { |
| 121 | + "kernelspec": { |
| 122 | + "display_name": "Python 3 (ipykernel)", |
| 123 | + "language": "python", |
| 124 | + "name": "python3" |
| 125 | + }, |
| 126 | + "language_info": { |
| 127 | + "codemirror_mode": { |
| 128 | + "name": "ipython", |
| 129 | + "version": 3 |
| 130 | + }, |
| 131 | + "file_extension": ".py", |
| 132 | + "mimetype": "text/x-python", |
| 133 | + "name": "python", |
| 134 | + "nbconvert_exporter": "python", |
| 135 | + "pygments_lexer": "ipython3", |
| 136 | + "version": "3.12.11" |
| 137 | + } |
| 138 | + }, |
| 139 | + "nbformat": 4, |
| 140 | + "nbformat_minor": 5 |
| 141 | +} |