{ "personal_info": { "name": "Umang Bhalla", "location": "smwhere in india", "description": "platform/mlops/devops/llmops/llm-agents engineer, shitpost demigod", "born": "March 2001", "other_links": { "github": "https://github.com/umgbhalla", "x": "https://x.com/umgbhalla", "discord": "umgbhalla", "linkedin": "https://linkedin.com/in/umgbhalla", "resume": "dm on x.com/umgbhalla" } }, "timezone": "GMT+5:30", "work_experience": [ { "year": "2024-present", "company": "NewEngen.com", "role": [ "SRE I", "SRE II" ], "location": "remote", "hq": "us", "work": [ "migration from gcp cloud run to kubernetes", "infrastructure optimization reducing compute cloud spend by 80%", "istio service mesh infra design for k8s native api gateway/ingress", "custom api layer auth with firebase integrated across all endpoints", "ci/cd pipeline automation for 50+ microservices, and general gitops implementation", "internal org-wide tool design+infra for campaign/ads/adset level taxonomy collection", "internal llm agent design and implementation for marketing analytics, report generation, and trend analysis systems on 2+tb of bq data", "transformer-based recommendation system for taxonomy tool including training pipeline, model refresh cycles, and evaluation systems", "internal dev tools for model train and rollout, agent evals, and secret management" ], "tech": [ "kubernetes", "gcp", "istio", "firebase", "python", "bash", "prometheus", "grafana", "agno", "pytorch", "qdrant" ] }, { "year": "2023-2024", "company": "Reint.ai", "role": "MLOps Consultant", "location": "remote", "hq": "au", "work": [ "mlops monitoring system, version control and metric logging for internal model train runs", "high frequency monitoring and alerting system for vms, and prod apis", "custom training infra to utilise multiple gpus, spot instances, and resume training from last checkpoint for long runs", "ablation studies on transformer models for time series forecasting", "logging and stats infra with elastic, logstash and kibana" ], "tech": [ "docker", "gcp", "python", "bash", "elastic", "logstash", "kibana", "pytorch", "autotrain", "clearml" ] }, { "year": "2022-2023", "company": "Bytelearn.com", "role": "DevOps Consultant", "location": "remote", "hq": "india", "work": [ "cost optimization on aws, over 77% savings on annual bill", "migration to kubernetes", "migration to gcp from aws", "opentelemetry integration for tracing and monitoring", "high availability monitoring and observability setup for opentelemetry with promscale, tsdb, grafana, prometheus", "multi node mysql and postgres deployment with perconadb", "ci/cd and uptime ownership for 30+ services", "automated testing env setup", "security and compliance audits for gcp and k8s environments" ], "tech": [ "kubernetes", "argocd", "docker", "gcp", "python", "bash", "prometheus", "grafana", "mysql", "postgres", "perconadb", "redis", "tsdb" ] } ], "summary": { "infrastructure": [ "kubernetes", "podman", "docker", "gke", "eks", "argocd", "terraform", "helm", "gcp", "aws", "azure", "istio", "prometheus", "grafana", "github actions" ], "databases": [ "perconadb", "postgresql", "mysql", "redis (cluster)", "mongodb", "elasticsearch", "iceberg", "hudi" ], "ml_ops": [ "clearml", "mlflow", "weights & biases", "autotrain", "pytorch", "tensorflow", "qdrant", "custom llm agents" ], "programming_languages": [ "python", "bash", "rust", "go", "javascript", "typescript" ], "monitoring_logging": [ "prometheus", "grafana", "elastic stack (elk)", "opentelemetry", "logstash", "kibana" ], "ci_cd": [ "github actions", "argocd", "gitlab ci", "circleci" ], "cloud": [ "gcp", "aws", "azure" ], "finops": [ "cost optimization", "billing analysis", "resource tagging" ], "other": [ "custom dev tools", "system design", "automation scripting" ] }, "format_source": "curl https://umgbhalla.xyz | jless" }