diff --git a/api-ref/requirements.txt b/api-ref/requirements.txt deleted file mode 100644 index 23b871f..0000000 --- a/api-ref/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx>=2.0.0,!=2.1.0 # BSD -otcdocstheme>=1.0.0 # Apache-2.0 -# releasenotes -reno>=3.1.0 # Apache-2.0 diff --git a/api-ref/source/conf.py b/api-ref/source/conf.py deleted file mode 100755 index 83d890b..0000000 --- a/api-ref/source/conf.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -sys.path.insert(0, os.path.abspath('../..')) -# -- General configuration ---------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - 'sphinx.ext.autodoc', - 'otcdocstheme', -] - -# autodoc generation is a bit aggressive and a nuisance when doing heavy -# text edit cycles. -# execute "export SPHINX_DEBUG=1" in your terminal to disable - -# The suffix of source filenames. -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'internal-documentation' -copyright = '2022, Open Telekom Cloud Developers' - -# If true, '()' will be appended to :func: etc. cross-reference text. 
-add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -add_module_names = True - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'native' - -# -- Options for HTML output -------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -# html_theme_path = ["."] -# html_theme = '_theme' -# html_static_path = ['static'] -html_theme = 'otcdocs' - -# Output file base name for HTML help builder. -htmlhelp_basename = '%sdoc' % project - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). -latex_documents = [ - ('index', - '%s.tex' % project, - '%s Documentation' % project, - 'Open Telekom Cloud Developers', 'manual'), -] - -# Example configuration for intersphinx: refer to the Python standard library. 
-#intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/api-ref/source/index.rst b/api-ref/source/index.rst deleted file mode 100644 index 30a4f11..0000000 --- a/api-ref/source/index.rst +++ /dev/null @@ -1,3 +0,0 @@ -====================================================== -Welcome to the documentation of internal-documentation -====================================================== diff --git a/dev_guide/requirements.txt b/dev_guide/requirements.txt deleted file mode 100644 index 23b871f..0000000 --- a/dev_guide/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx>=2.0.0,!=2.1.0 # BSD -otcdocstheme>=1.0.0 # Apache-2.0 -# releasenotes -reno>=3.1.0 # Apache-2.0 diff --git a/dev_guide/source/conf.py b/dev_guide/source/conf.py deleted file mode 100755 index 83d890b..0000000 --- a/dev_guide/source/conf.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -sys.path.insert(0, os.path.abspath('../..')) -# -- General configuration ---------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - 'sphinx.ext.autodoc', - 'otcdocstheme', -] - -# autodoc generation is a bit aggressive and a nuisance when doing heavy -# text edit cycles. 
-# execute "export SPHINX_DEBUG=1" in your terminal to disable - -# The suffix of source filenames. -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'internal-documentation' -copyright = '2022, Open Telekom Cloud Developers' - -# If true, '()' will be appended to :func: etc. cross-reference text. -add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -add_module_names = True - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'native' - -# -- Options for HTML output -------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -# html_theme_path = ["."] -# html_theme = '_theme' -# html_static_path = ['static'] -html_theme = 'otcdocs' - -# Output file base name for HTML help builder. -htmlhelp_basename = '%sdoc' % project - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). -latex_documents = [ - ('index', - '%s.tex' % project, - '%s Documentation' % project, - 'Open Telekom Cloud Developers', 'manual'), -] - -# Example configuration for intersphinx: refer to the Python standard library. 
-#intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/dev_guide/source/index.rst b/dev_guide/source/index.rst deleted file mode 100644 index 30a4f11..0000000 --- a/dev_guide/source/index.rst +++ /dev/null @@ -1,3 +0,0 @@ -====================================================== -Welcome to the documentation of internal-documentation -====================================================== diff --git a/doc/source/_static/images/added_new_text.png b/doc/source/_static/images/added_new_text.png new file mode 100755 index 0000000..040243c Binary files /dev/null and b/doc/source/_static/images/added_new_text.png differ diff --git a/doc/source/_static/images/api_calling_process_flow.png b/doc/source/_static/images/api_calling_process_flow.png new file mode 100644 index 0000000..e237161 Binary files /dev/null and b/doc/source/_static/images/api_calling_process_flow.png differ diff --git a/doc/source/_static/images/compare_commits.png b/doc/source/_static/images/compare_commits.png new file mode 100755 index 0000000..2ed6159 Binary files /dev/null and b/doc/source/_static/images/compare_commits.png differ diff --git a/doc/source/_static/images/compare_images.png b/doc/source/_static/images/compare_images.png new file mode 100755 index 0000000..37fa2ff Binary files /dev/null and b/doc/source/_static/images/compare_images.png differ diff --git a/doc/source/_static/images/compare_text.png b/doc/source/_static/images/compare_text.png new file mode 100755 index 0000000..7c95074 Binary files /dev/null and b/doc/source/_static/images/compare_text.png differ diff --git a/doc/source/_static/images/helpcenter_gitops.png b/doc/source/_static/images/helpcenter_gitops.png new file mode 100755 index 0000000..89e54f6 Binary files /dev/null and b/doc/source/_static/images/helpcenter_gitops.png differ diff --git a/doc/source/_static/images/jira_document_pr_link.png b/doc/source/_static/images/jira_document_pr_link.png new file mode 100755 index 0000000..b32cba5 Binary files 
/dev/null and b/doc/source/_static/images/jira_document_pr_link.png differ diff --git a/doc/source/_static/images/obtain_x-subject-token.png b/doc/source/_static/images/obtain_x-subject-token.png new file mode 100644 index 0000000..7331ada Binary files /dev/null and b/doc/source/_static/images/obtain_x-subject-token.png differ diff --git a/doc/source/_static/images/otc.png b/doc/source/_static/images/otc.png new file mode 100644 index 0000000..8ec4115 Binary files /dev/null and b/doc/source/_static/images/otc.png differ diff --git a/doc/source/_static/images/permissions.png b/doc/source/_static/images/permissions.png new file mode 100644 index 0000000..829cba5 Binary files /dev/null and b/doc/source/_static/images/permissions.png differ diff --git a/doc/source/_static/images/sample_code_project_structure.png b/doc/source/_static/images/sample_code_project_structure.png new file mode 100644 index 0000000..508f4c1 Binary files /dev/null and b/doc/source/_static/images/sample_code_project_structure.png differ diff --git a/doc/source/_static/images/sample_code_select_file.png b/doc/source/_static/images/sample_code_select_file.png new file mode 100644 index 0000000..0adf514 Binary files /dev/null and b/doc/source/_static/images/sample_code_select_file.png differ diff --git a/doc/source/_static/images/sample_code_select_project.png b/doc/source/_static/images/sample_code_select_project.png new file mode 100644 index 0000000..94736e3 Binary files /dev/null and b/doc/source/_static/images/sample_code_select_project.png differ diff --git a/doc/source/_static/images/sdkdemo_properties.png b/doc/source/_static/images/sdkdemo_properties.png new file mode 100644 index 0000000..1cc55fc Binary files /dev/null and b/doc/source/_static/images/sdkdemo_properties.png differ diff --git a/doc/source/_static/images/token_authentication_example_request.png b/doc/source/_static/images/token_authentication_example_request.png new file mode 100644 index 0000000..52cbcd9 Binary files 
/dev/null and b/doc/source/_static/images/token_authentication_example_request.png differ diff --git a/doc/source/_static/images/viewing_domain_id.png b/doc/source/_static/images/viewing_domain_id.png new file mode 100644 index 0000000..a24ff60 Binary files /dev/null and b/doc/source/_static/images/viewing_domain_id.png differ diff --git a/doc/source/_static/images/viewing_project_ids.jpg b/doc/source/_static/images/viewing_project_ids.jpg new file mode 100644 index 0000000..b0e263d Binary files /dev/null and b/doc/source/_static/images/viewing_project_ids.jpg differ diff --git a/doc/source/index.rst b/doc/source/index.rst index 30a4f11..656cecd 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -1,3 +1,8 @@ -====================================================== -Welcome to the documentation of internal-documentation -====================================================== +====================== +Internal Documentation +====================== + +.. toctree:: + :maxdepth: 2 + + training/index diff --git a/doc/source/training/apimon_training/alerts.rst b/doc/source/training/apimon_training/alerts.rst new file mode 100644 index 0000000..4658ae8 --- /dev/null +++ b/doc/source/training/apimon_training/alerts.rst @@ -0,0 +1,110 @@ +====== +Alerts +====== + +Alerta is the component of the ApiMon that is designed to integrate alerts +from multiple sources. It supports many different standard sources like Syslog, +SNMP, Prometheus, Nagios, Zabbix, etc. Additionally any other type of source +using URL request or command line can be integrated as well. + +Native functions like correlation and de-duplication help to manage thousands of +alerts in transparent way and consolidate alerts in proper categories based on +environment, service, resource, failure type, etc. + +Alerta is hosted on https://alerts.eco.tsi-dev.otc-service.com/ . +The authentication is centrally managed by OTC LDAP. 
+ +The Zulip API was integrated with Alerta, to send notification of errors/alerts +on Zulip stream. + +Alerts displayed on OTC Alerta are generated either by Executor, Scheduler, +EpMon or by Grafana. + + - “Executor alerts” focus on playbook results, whether playbook has completed + or failed. + - “Grafana alerts” focus on breaching the defined thresholds. For example API + response time is higher than defined threshold. + - "Scheduler alerts" TBD + - "EpMon alerts" provide information about failed endpoint queries with details + of the request in curl form and the respective error response details + + + +.. image:: training_images/alerta_dashboard.png + + + +Alerts in Alerta are organized in environment tabs based on OTC regions. + + - PRODUCTION EU-DE + - PRODUCTION EU-NL + - HYBRID-SWISS + - ALL + +Every single alert shows 3 views: + +- **Details** - all alert parameters are shown on the single views +- **History** - occurrences of the alert in time (without de-duplication) +- **Data** - extracted error message from the event + + +Alert object consists of the following fields: + ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| Alert Field | Description | ++======================+========================================================================================================================================+ +| **Alert ID** | Reference to alert in Alerta | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Create Time** | Timestamp of alert creation | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Service** | Information about affected service and type of monitoring | 
++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Environment** | Information about affected environment/region | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Resource** | Further details in which particular resource issue has happened | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Event** | Short description of error result | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Correlate** | Currently not in use | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Group** | Further categorization of alerts (currently not used) | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Severity** | Critical - EpMon, Major - ApiMon | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Status** | - **Open** - default status when alert is received in Alerta | +| | - **Ack** - Acknowledged status, indicating that the incident of the service or of the host has been taken into account by a user. | +| | - **Shelve** - change alert status to shelved which removes the alerts from the active console and prevents any further notifications. 
| +| | - **Close** - change alert status to closed | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Value** | Same like Event field | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Text** | Currently not in use | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Trend Indication** | Currently not in use | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Timeout** | Time after which alert disappears from Alerta (default is 24h) | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Type** | - Apimon Executor Alert - ApiMon related events | +| | - Exception Alert - EpMon related events | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Duplicate count** | De-duplication feature - number of re-occurring same alerts | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Repeat** | If duplicateCount is 0 or the alert status has changed then repeat is False, otherwise it is True | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Origin** | Information about origin location from where the job has been executed | 
++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Tags** | Further details in which particular resource issue has happened | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Log Url** | Reference to job execution output on Swift object storage (only for ApiMon alerts) | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **Log Url Web** | Reference to job execution output on Swift object storage (only for ApiMon alerts) | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ +| **State** | - Present - if alert is still actual | +| | - Present - if alert is not occurring anymore | ++----------------------+----------------------------------------------------------------------------------------------------------------------------------------+ + + + +.. image:: training_images/alerta_detail.jpg + + diff --git a/doc/source/training/apimon_training/contact.rst b/doc/source/training/apimon_training/contact.rst new file mode 100644 index 0000000..7a37812 --- /dev/null +++ b/doc/source/training/apimon_training/contact.rst @@ -0,0 +1,29 @@ +Contact - Whom to address for Feedback? +======================================= + +In case you have any feedback, proposals or found any issues regarding the +ApiMon, EpMon or CloudMon, you can address them in the corresponding GitHub +OpenTelekomCloud-Infra repositories or StackMon repositories. 
+ +Issues or feedback regarding the **ApiMon, EpMon, Status Dashboard, Metric +processor** as well as new feature requests can be addressed by filing an issue +on the **Gihub** repository under +https://github.com/opentelekomcloud-infra/system-config/blob/main/inventory/service/group_vars/apimon.yaml (CMO) +https://github.com/opentelekomcloud-infra/stackmon-config (FMO) + +If you have found any problems which affects the **ApiMon dashboard design** +please open an issue/PR on **GitHub** +https://github.com/opentelekomcloud-infra/system-config/tree/main/playbooks/templates/grafana/apimon (CMO) +https://github.com/stackmon/apimon-tests (FMO) + + +If you have found any problems which affects the **ApiMon playbook scenarios** +please open an issue/PR on **GitHub** +https://github.com/opentelekomcloud-infra/apimon-tests (CMO) +https://github.com/stackmon/apimon-tests (FMO). + +If there is another issue/demand/request try to locate proper repository in +https://github.com/orgs/stackmon/repositories + +For general questions you can write an E-Mail to the `Ecosystems Squad +`_. \ No newline at end of file diff --git a/doc/source/training/apimon_training/dashboards.rst b/doc/source/training/apimon_training/dashboards.rst new file mode 100644 index 0000000..b2e5ae9 --- /dev/null +++ b/doc/source/training/apimon_training/dashboards.rst @@ -0,0 +1,148 @@ +===================== +Dashboards management +===================== + +https://dashboard.tsi-dev.otc-service.com + +The authentication is centrally managed by OTC LDAP. + + +The ApiMon Dashboards are segregated based on the type of service: + + - The “OTC KPI” dashboard provides high level overview about OTC stability and + reliability for management. + - “Endpoint monitoring” dashboard monitors health of every endpoint url listed + by endpoint services catalogue. + - “Respective service statistics” dashboards provide more detailed overview. 
+ - 24/7 Mission Control dashboard used by 24/7 squad for daily monitoring and + addressing the alerts. + - Dashboards can be replicated/customized for individual Squad needs. + + +All the dashboards support Environment (target monitored platform) and Zone +(monitoring source location) variables at the top of each dashboard so these +views can be adjusted based on chosen value. + +.. image:: training_images/dashboards.png + + +OTC KPI Dashboard +================= + +OTC KPI dashboard was requested by management to provide SLA like views on +services including: + + - Global SLI views (Service Level Indicators) of API availability, latency, API errors + - Global SLO views (Service Leven Objectives) + - Service based SLI views of availability, success rate, errors counts, latencies + - Customer service views for specific case like OS boot time duration, server + provisioning failures, volume backup duration, etc + +https://dashboard.tsi-dev.otc-service.com/d/APImonKPI/otc-kpi?orgId=1 + +These views provide immediate status of overall dashboard as well as the status +of the specific service. + +.. image:: training_images/kpi_dashboard.png + + +24/7 Mission control dashboards +=============================== + +24/7 Mission control squads uses CloudMon, ApiMon and EpMon metrics and present +them on their own customized dashboards which are fulfilling their +requirements. + +https://dashboard.tsi-dev.otc-service.com/d/eBQoZU0nk/overview?orgId=1&refresh=1m + +.. 
image:: training_images/24_7_dashboard.jpg + +Endpoint Monitoring Dashboard +============================= + +Endpoint Monitoring dashboards uses metrics from GET query requests towards OTC +platform (:ref:`EpMon Overview `) and visualize it in: + + - General endpoint availability dashboard + - Endpoint dashboard response times + - No Response dashboard + - Error count dashboard + +https://dashboard.tsi-dev.otc-service.com/d/APImonEPmon/endpoint-monitoring?orgId=1 + + +ApiMon Test Results Dashboard +============================= + +This dashboard summarizes the overall status of the ApiMon playbook scenarios +for all services. The scenarios are fetched in endless loop from github +repository (:ref:`Test Scenarios `), executed and various metrics (:ref:`Metric +Definitions `) are collected. + +https://dashboard.tsi-dev.otc-service.com/d/APImonTestRes/apimon-test-results?orgId=1 + +On this dashboard users can immeditaly identify: + + - count of API errors + - which scenarios are passing, failing, being skipped, + - how long these test scenarios are running + - the list of failed scenarios with links to Ansible playbook output.log + +Based on historical trends and annotations user can identify whether sudden +change in the scenario behavior has been impacted by some planned change on +platform (JIRA annotations) or whether there's some new outage/bug. + +.. image:: training_images/apimon_test_results.jpg + +Service Based Dashboard +======================= + +The dashboard provides deeper insight in single service with tailored views, +graphs and tables to address the service major functionalities abd specifics. 
+ +https://dashboard.tsi-dev.otc-service.com/d/APImonCompute/compute-service-statistics?orgId=1 + +For example in Compute Service Statistics such dashboard include: + + - Success rate of ECS deployments across different availability zones + - Instance boot duration for most common images + - SSH successful logins + - Metadata server latencies and query failures + - API calls duration + - Bad API calls + - Failures in tasks + - Scenario results + +This dashboard should be fully customized by respective responsible squad as +they know best what they need to monitor and check for their service. + + +.. image:: training_images/compute_service_statistics_1.jpg + +.. image:: training_images/compute_service_statistics_2.jpg + +Custom Dashboards +================= + +Previous dashboards are predefined and read-only. +The further customization is currently possible via system-config in github: + +https://github.com/opentelekomcloud-infra/system-config/tree/main/playbooks/templates/grafana/apimon + +The predefined dashboard Jinja templates are stored there and can be customized +in standard gitops way (fork and pull request) In future this process will be +replaced by simplified dashboard panel definition in Stackmon Github +repository (https://github.com/stackmon/apimon-tests/tree/main/dashboards) + +Dashboards can be customized also just by copy/save function directly in +Grafana. So in case of customization of Compute Service Statistics dashboard the +whole dashboard can be saved under new name and then edited without any +restrictions. + +This approach is valid for PoC, temporary solutions and investigations but +should not be used as permanent solution as customized dashboards which are not +properly stored on Github repositories might be permanently deleted in case of +full dashboard service re-installation. 
+ + + diff --git a/doc/source/training/apimon_training/databases.rst b/doc/source/training/apimon_training/databases.rst new file mode 100644 index 0000000..ba11015 --- /dev/null +++ b/doc/source/training/apimon_training/databases.rst @@ -0,0 +1,141 @@ +.. _metric_databases: + +================ +Metric Databases +================ + +Metrics are stored in 2 different database types: + + - Graphite time series database + - Postgresql relational database + + +Graphite +======== + + + `Graphite `_ is an open-source enterprise-ready + time-series database. ApiMon, EpMon, and CloudMon data are stored in the + clustered Graphite TSDB. Metrics emitted by the processes are gathered in the + row of statsd processes which aggregate metrics to 10s precision. + + ++---------------------+-----------------------------------------------------------------------------------------------+ +| Parameter | Value | ++=====================+===============================================================================================+ +| Grafana Datasource | apimon-carbonapi | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database type | time series | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Main namespace | stats | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Metric type | OpenStack API metrics (including otcextensions) collecting response codes, latencies, methods | +| | ApiMOn metrics (create_cce_cluster, delete_volume_eu-de-01, etc) | +| | Custom metrics which can be created by tags in ansible playbooks | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database attributes | "timers", "counters", "environment name", "monitoring location", "service", "request method", | +| | 
"resource", "response code", "result", custom metrics, etc | ++---------------------+-----------------------------------------------------------------------------------------------+ +| result of API calls | attempted | +| | passed | +| | failed | ++---------------------+-----------------------------------------------------------------------------------------------+ + + +.. image:: training_images/graphite_query.jpg + + +All metrics are under "stats" namespace: + +Under "stats" there are following important metric types: + +- counters +- timers +- gauges + +Counters and timers have following subbranches: + +- apimon.metric → specific apimon metrics not gathered by the OpenStack API + methods +- openstack.api → pure API request metrics + +Every section has further following branches: + +- environment name (production_regA, production_regB, etc) + + - monitoring location (production_regA, awx) - specification of the environment from which the metric is gathered + + +openstack.api +------------- + +OpenStack metrics branch is structured as following: + +- service (normally service_type from the service catalog, but sometimes differs slightly) + + - request method (GET/POST/DELETE/PUT) + + - resource (service resource, i.e. server, keypair, volume, etc). Sub-resources are joined with "_" (i.e. 
cluster_nodes) + + - response code - received response code + + - count/upper/lower/mean/etc - timer specific metrics (available only under stats.timers.openstack.api.$environment.$zone.$service.$request_method.$resource.$status_code.{count,mean,upper,*}) + - count/rate - counter specific metrics (available only under stats.counters.openstack.api.$environment.$zone.$service.$request_method.$resource.$status_code.{count,mean,upper,*}) + + - attempted - counter for the attempted requests (only for counters) + - failed - counter of failed requests (not received response, connection problems, etc) (only for counters) + - passed - counter of requests receiving any response back (only for counters) + + +apimon.metric +------------- + +- metric name (i.e. create_cce_cluster, delete_volume_eu-de-01, etc) - complex metrics branch + + - attempted/failed/failedignored/passed/skipped - counters for the corresponding operation results (this branch element represents status of the corresponding ansible task) + + - $az - some metrics would have availability zone for the operation on that level. Since this info is not always available this is a varying path + +- curl - subtree for the curl type of metrics + + - $name - short name of the host to be checked + + +- stats.timers.apimon.metric.$environment.$zone.**csm_lb_timings**.{public,private}.{http,https,tcp}.$az.__VALUE__ - timer values for the loadbalancer test +- stats.counters.apimon.metric.$environment.$zone.**csm_lb_timings**.{public,private}.{http,https,tcp}.$az.{attempted,passed,failed} - counter values for the loadbalancer test +- stats.timers.apimon.metric.$environment.$zone.**curl**.$host.{passed,failed}.__VALUE__ - timer values for the curl test +- stats.counters.apimon.metric.$environment.$zone.**curl**.$host.{attempted,passed,failed} - counter values for the curl test +- stats.timers.apimon.metric.$environment.$zone.**dns**.$ns_name.$host - timer values for the NS lookup test. 
$ns_name is the DNS servers used to query the records +- stats.counters.apimon.metric.$environment.$zone.**dns**.$ns_name.$host.{attempted,passed,failed} - counter values for the NS lookup test + + +Postgresql +========== + +Relational database stores ApiMon playbook scenario results which provides statistics about most common service functionalities and use cases. +These queries are used mainly on Test Results dashboard and Service specific statistics dashboards. + + ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Parameter | Value | ++===============================+=============================================================================================================+ +| Grafana Datasource | apimon-pg | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Database Type | relational | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Database Table | results_summary | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Metric type | apimon playbook result statistics | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| Database Fields | "timestamp", "name", "job_id", "result", "duration", "result_task" | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| result field values | 0 - success | +| | 1 - ? 
| +| | 2 - skipped | +| | 3 - failed | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ +| result_task object parameters | "timestamp", "name", "job_id", "result", "duration", "action", "environment", "zone", "anonymized_response" | ++-------------------------------+-------------------------------------------------------------------------------------------------------------+ + + +.. image:: training_images/postgresql_query.jpg diff --git a/doc/source/training/apimon_training/difference_cmo_fmo.rst b/doc/source/training/apimon_training/difference_cmo_fmo.rst new file mode 100644 index 0000000..2d2fd66 --- /dev/null +++ b/doc/source/training/apimon_training/difference_cmo_fmo.rst @@ -0,0 +1,34 @@ +.. _difference_apimon_cmo_fmo: + +=================================== +Difference ApiMon(CMO)/ApiMon(FMO) +=================================== + +Due to the ongoing transformation of ApiMon and integration to a more robust +CloudMon there are two operation modes right now. Therefore it's important to +understand what is supported in which mode. + +This page aims to provide navigation links and understand the changes once the +transformation is completed and some of the locations will change. 
+ +The most important differences are described in the table below: + ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| **Differences** | **ApiMon (CMO)** | **ApiMon(FMO)** | ++=====================+============================================================================================================+==========================================================================+ +| Playbook scenarios | https://github.com/opentelekomcloud-infra/apimon-test | https://github.com/stackmon/apimon-tests/tree/main/playbooks | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Dashboards setup | https://github.com/opentelekomcloud-infra/system-config/tree/main/playbooks/templates/grafana/apimon | https://github.com/stackmon/apimon-tests/tree/main/dashboards | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Environment setup | https://github.com/opentelekomcloud-infra/system-config/blob/main/inventory/service/group_vars/apimon.yaml | https://github.com/opentelekomcloud-infra/stackmon-config | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Implementation mode | standalone app | plugin based | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Organization | 
opentelekomcloud-infra | stackmon | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Dashboards | https://dashboard.tsi-dev.otc-service.com/ | https://dashboard.tsi-dev.otc-service.com/ | +| | https://dashboard.tsi-dev.otc-service.com/dashboards/f/UaB8meoZk/apimon | https://dashboard.tsi-dev.otc-service.com/dashboards/f/CloudMon/cloudmon | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ +| Documentation | https://confluence.tsi-dev.otc-service.com/display/ES/API-Monitoring | https://stackmon.github.io/ | +| | | https://stackmon-cloudmon.readthedocs.io/en/latest/index.html | ++---------------------+------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+ diff --git a/doc/source/training/apimon_training/epmon_checks.rst b/doc/source/training/apimon_training/epmon_checks.rst new file mode 100644 index 0000000..8dbb0d8 --- /dev/null +++ b/doc/source/training/apimon_training/epmon_checks.rst @@ -0,0 +1,40 @@ +.. _epmon_overview: + +============================ +Endpoint Monitoring overview +============================ + + +EpMon is a standalone python based process targeting every OTC service. It +finds service in the service catalogs and sends GET requests to the configured +endpoints. + +Performing extensive tests like provisioning a server is giving a great +coverage, but is usually not something what can be performed very often and +leaves certain gaps on the timescale of monitoring. 
In order to cover this gap +the EpMon component is capable of sending GET requests to the given URLs relying on the +API discovery of the OpenStack cloud (perform GET request to /servers or the +compute endpoint). Such requests are cheap and can be performed in a loop, i.e. +every 5 seconds. Latency of those calls, as well as the return codes, is being +captured and sent to the metrics storage. + + + +Currently the EpMon configuration is located in system-config: +https://github.com/opentelekomcloud-infra/system-config/blob/main/inventory/service/group_vars/apimon.yaml +(this will change in the future once CloudMon takes over) + +It defines the HTTP query targets for every single OTC service. + +The EpMon dashboard provides the general availability status of every service definition +from the service catalog: + +.. image:: training_images/epmon_status_dashboard.jpg + +Additionally it provides further details for the endpoints like response times, +detected error codes or no responses at all. + +.. image:: training_images/epmon_dashboard_details.jpg + +EpMon findings are also reported to Alerta and notifications are sent to the Zulip +dedicated topic "apimon_endpoint_monitoring". 
diff --git a/doc/source/training/apimon_training/faq/faq_images/alerta_alerts_detail.png b/doc/source/training/apimon_training/faq/faq_images/alerta_alerts_detail.png new file mode 100644 index 0000000..3f40ac2 Binary files /dev/null and b/doc/source/training/apimon_training/faq/faq_images/alerta_alerts_detail.png differ diff --git a/doc/source/training/apimon_training/faq/faq_images/annotations.jpg b/doc/source/training/apimon_training/faq/faq_images/annotations.jpg new file mode 100644 index 0000000..4481db7 Binary files /dev/null and b/doc/source/training/apimon_training/faq/faq_images/annotations.jpg differ diff --git a/doc/source/training/apimon_training/faq/faq_images/dashboard_log_links.jpg b/doc/source/training/apimon_training/faq/faq_images/dashboard_log_links.jpg new file mode 100644 index 0000000..65d08c0 Binary files /dev/null and b/doc/source/training/apimon_training/faq/faq_images/dashboard_log_links.jpg differ diff --git a/doc/source/training/apimon_training/faq/faq_images/zulip_notification_links.jpg b/doc/source/training/apimon_training/faq/faq_images/zulip_notification_links.jpg new file mode 100644 index 0000000..b724ee4 Binary files /dev/null and b/doc/source/training/apimon_training/faq/faq_images/zulip_notification_links.jpg differ diff --git a/doc/source/training/apimon_training/faq/how_can_i_access_dashboard.rst b/doc/source/training/apimon_training/faq/how_can_i_access_dashboard.rst new file mode 100644 index 0000000..0522222 --- /dev/null +++ b/doc/source/training/apimon_training/faq/how_can_i_access_dashboard.rst @@ -0,0 +1,7 @@ +============================ +How Can I Access Dashboard ? +============================ + +OTC LDAP authentication is supported on +https://dashboard.tsi-dev.otc-service.com. 
+ diff --git a/doc/source/training/apimon_training/faq/how_to_read_the_logs_and_understand_the_issue.rst b/doc/source/training/apimon_training/faq/how_to_read_the_logs_and_understand_the_issue.rst new file mode 100644 index 0000000..95af642 --- /dev/null +++ b/doc/source/training/apimon_training/faq/how_to_read_the_logs_and_understand_the_issue.rst @@ -0,0 +1,80 @@ +.. _working_with_logs: + +============================================= +How To Read The Logs And Understand The Issue +============================================= + + +Logs are stored on swift OBS and they expire after ~1 week. The logs are can be +accessed from multiple locations: + + - Zulip notifications: + + + .. image:: faq_images/zulip_notification_links.jpg + + + - Alerts in Alerta + + + .. image:: faq_images/alerta_alerts_detail.png + + + - Tables in dashboards + + + .. image:: faq_images/dashboard_log_links.jpg + + +The logs contain whole ansible playbook output and help to analyze the problem +in detail. +For example following log detail describes the failed scenario for ECS deployment:: + + 2023-05-17 21:08:09.038955 | TASK [server_create_delete : Try connecting] + 2023-05-17 21:08:09.485569 | localhost | ERROR + 2023-05-17 21:08:09.485862 | localhost | { + 2023-05-17 21:08:09.485922 | localhost | "changed": true, + 2023-05-17 21:08:09.485950 | localhost | "cmd": [ + 2023-05-17 21:08:09.485984 | localhost | "ssh", + 2023-05-17 21:08:09.486016 | localhost | "-o", + 2023-05-17 21:08:09.486052 | localhost | "UserKnownHostsFile=/dev/null", + 2023-05-17 21:08:09.486076 | localhost | "-o", + 2023-05-17 21:08:09.486097 | localhost | "StrictHostKeyChecking=no", + 2023-05-17 21:08:09.486118 | localhost | "linux@80.158.60.117", + 2023-05-17 21:08:09.486138 | localhost | "-i", + 2023-05-17 21:08:09.486160 | localhost | "~/.ssh/scenario2a-162b6915911748c5809474be69d2a3b3-kp.pem" + 2023-05-17 21:08:09.486192 | localhost | ], + 2023-05-17 21:08:09.486221 | localhost | "delta": "0:00:00.127394", + 
2023-05-17 21:08:09.486242 | localhost | "end": "2023-05-17 21:08:09.454247", + 2023-05-17 21:08:09.486262 | localhost | "invocation": { + 2023-05-17 21:08:09.486283 | localhost | "module_args": { + 2023-05-17 21:08:09.486314 | localhost | "_raw_params": "ssh -o 'UserKnownHostsFile=/dev/null' -o 'StrictHostKeyChecking=no' linux@80.158.60.117 -i ~/.ssh/scenario2a-162b6915911748c5809474be69d2a3b3-kp.pem", + 2023-05-17 21:08:09.486373 | localhost | "_uses_shell": false, + 2023-05-17 21:08:09.486397 | localhost | "argv": null, + 2023-05-17 21:08:09.486428 | localhost | "chdir": null, + 2023-05-17 21:08:09.486455 | localhost | "creates": null, + 2023-05-17 21:08:09.486487 | localhost | "executable": null, + 2023-05-17 21:08:09.486513 | localhost | "removes": null, + 2023-05-17 21:08:09.486533 | localhost | "stdin": null, + 2023-05-17 21:08:09.486553 | localhost | "stdin_add_newline": true, + 2023-05-17 21:08:09.486573 | localhost | "strip_empty_ends": true, + 2023-05-17 21:08:09.486593 | localhost | "warn": false + 2023-05-17 21:08:09.486613 | localhost | } + 2023-05-17 21:08:09.486633 | localhost | }, + 2023-05-17 21:08:09.486657 | localhost | "msg": "non-zero return code", + 2023-05-17 21:08:09.486689 | localhost | "rc": 255, + 2023-05-17 21:08:09.486713 | localhost | "start": "2023-05-17 21:08:09.326853", + 2023-05-17 21:08:09.486734 | localhost | "stderr": "Pseudo-terminal will not be allocated because stdin is not a terminal.\r\nWarning: Permanently added '80.158.60.117' (ED25519) to the list of known hosts.\r\nlinux@80.158.60.117: Permission denied (publickey).", + 2023-05-17 21:08:09.486755 | localhost | "stderr_lines": [ + 2023-05-17 21:08:09.486776 | localhost | "Pseudo-terminal will not be allocated because stdin is not a terminal.", + 2023-05-17 21:08:09.486808 | localhost | "Warning: Permanently added '80.158.60.117' (ED25519) to the list of known hosts.", + 2023-05-17 21:08:09.486834 | localhost | "linux@80.158.60.117: Permission denied (publickey)." 
+ 2023-05-17 21:08:09.486855 | localhost | ] + 2023-05-17 21:08:09.486875 | localhost | } + +In this case it seems that deployed ECS doesn't contain injected public SSH key +which can point to issue with cloud init or metadata server. + +The playbooks can be run also manually on any OTC tenant and can be used +for further investigation and analysis. + diff --git a/doc/source/training/apimon_training/faq/index.rst b/doc/source/training/apimon_training/faq/index.rst new file mode 100644 index 0000000..533da05 --- /dev/null +++ b/doc/source/training/apimon_training/faq/index.rst @@ -0,0 +1,10 @@ +========================== +Frequently Asked Questions +========================== + +.. toctree:: + :maxdepth: 1 + + how_can_i_access_dashboard + how_to_read_the_logs_and_understand_the_issue + what_are_the_annotations diff --git a/doc/source/training/apimon_training/faq/what_are_the_annotations.rst b/doc/source/training/apimon_training/faq/what_are_the_annotations.rst new file mode 100644 index 0000000..1d0b539 --- /dev/null +++ b/doc/source/training/apimon_training/faq/what_are_the_annotations.rst @@ -0,0 +1,22 @@ +######################### +What Are The Annotations? +######################### + +Annotations provide a way to mark points on the graph with rich events. When you +hover over an annotation you can get event description and event tags. The text +field can include links to other systems with more detail. + +.. image:: faq_images/annotations.jpg + + +In ApiMon Dashboards annotations are used to show the JIRA change issue types +which change the transition from SCHEDULED to IN EXECUTION. This helps to +identify if some JIRA change has negative impact on platform in real time. 
The +annotations contain several fields which help to correlate the platform behavior +with the respective change directly on the dashboard: + + - JIRA Change issue ID + - Impacted Availability Zone + - Affected Environment + - Main component + - Summary diff --git a/doc/source/training/apimon_training/index.rst b/doc/source/training/apimon_training/index.rst new file mode 100644 index 0000000..792a1b5 --- /dev/null +++ b/doc/source/training/apimon_training/index.rst @@ -0,0 +1,22 @@ +=================== +Apimon Training +=================== + +.. toctree:: + :maxdepth: 1 + + introduction + workflow + monitoring_coverage + test_scenarios + epmon_checks + dashboards + metrics + databases + alerts + notifications + logs + difference_cmo_fmo + contact + recorded_session + faq/index diff --git a/doc/source/training/apimon_training/introduction.rst b/doc/source/training/apimon_training/introduction.rst new file mode 100644 index 0000000..67e5e36 --- /dev/null +++ b/doc/source/training/apimon_training/introduction.rst @@ -0,0 +1,108 @@ +============ +Introduction +============ + +The Open Telekom Cloud is represented to users and customers by the API +endpoints and the various services behind them. Users and operators are +interested in a reliable way to check and verify if the services are actually +available to them via the Internet. While internal monitoring checks on the OTC +backplane are necessary, they are not sufficient to detect failures that +manifest in the interface, network connectivity, or the API logic itself. Also +helpful, but not sufficient are simple HTTP requests to the REST endpoints and +checking for 200 status codes. + +The ApiMon is Open Telekom Cloud product developed by +Ecosystem squad. + +The ApiMon a.k.a API-Monitoring project: + + - Developed with aim to supervise 24/7 the public APIs of OTC platform. + - Requests repeatedly sent to the API. + - Requests grouped in so-called scenarios, mimicking real-world use + cases. 
+ - Use cases are implemented as Ansible playbooks. + - Easy to extend the API-Monitoring for other use cases like + monitoring the provisioning of extra VMs or deploying extra software. + + +.. image:: https://stackmon.github.io/assets/images/solution-diagram.svg + +ApiMon Architecture Summary +--------------------------- + + - Test Scenarios are implemented as ansible playbooks and pushed to + `Github `_. + + - EpMon executes various HTTP query requests towards service endpoints and + generates statistics + - Scheduler fetches the latest playbooks from the repo and puts them in a + queue to run in an endless loop. + - Executor is running the playbooks from the queue and capturing the metrics + - The ansible playbook results generate the metrics (duration, result). + - Test scenario metrics are sent to the postgresql relational database. + - The HTTP request metrics (generated by OpenStackSDK) are collected by + statsd. + - The Time Series database (graphite) is pulling metrics from statsd. + - Grafana dashboards visualize data from postgresql and graphite. + - Alerta monitoring is used for raising alarms when an API times out, returns an error + or the response time exceeds a threshold. + - Alerta further sends error notifications on the Zulip #Alerts Stream. + - Log Files are maintained on OTC object storage via swift. 
+ +ApiMon features +--------------- + +ApiMon comes with the following features: + +- Support of ansible playbooks for testing scenarios +- Support of HTTP requests (GET) for Endpoint Monitoring +- Support of TSDB and RDB +- Support of all OTC environments + + - EU-DE + - EU-NL + - Swisscloud + - PREPROD + +- Support of multiple Monitoring sources: + + - internal (OTC) + - external (vCloud) + +- Alerts aggregated in Alerta and notifications sent to zulip +- Various dashboards + + - KPI dashboards + - 24/7 squad dashboards + - General test results dashboards + - Specific squad/service based dashboards + +- Each squad can control and manage their test scenarios and dashboards +- Every execution of ansible playbooks stores the log file for further + investigation/analysis on swift object storage + + +What ApiMon is NOT +------------------ + +The following items are out of scope (while some of them are technically +possible): + +- No performance monitoring: The API-Monitoring does not measure degradations of + performance per se. So measuring the access times or data transfer rates of an + SSD disk is out of scope. However, if the performance of a resource drops + under some threshold that is considered as equivalent to non-available, this + is reported. +- No application monitoring: The service availability of applications that run + on top of IaaS or PaaS of the cloud is out of scope. +- No view from inside: The API-Monitoring has no internal backplane insights and + only uses public APIs of the monitored cloud. It requires thus no + administrative permissions on the backend. It can be, however, deployed + additionally in the backplane to monitor additionally internal APIs. +- No synthetic workloads: The service is not simulating any workloads (for + example a benchmark suite) on the provisioned resources. Instead it measures + and reports only if APIs are available and return expected results with an + expected behavior. 
+- No every single API monitoring .The API-Monitoring focuses on basic API + functionality of selected components. It doesn't cover every single API call + available in OTC API product portfolio. diff --git a/doc/source/training/apimon_training/logs.rst b/doc/source/training/apimon_training/logs.rst new file mode 100644 index 0000000..68d46f9 --- /dev/null +++ b/doc/source/training/apimon_training/logs.rst @@ -0,0 +1,45 @@ +.. _logs: + +==== +Logs +==== + + +- Every single job run log is stored on OpenStack Swift object storage. +- Each single job log file provides unique URL which can be accessed to see log + details +- These URLs are available on all ApiMon levels: + + - In Zulip alarm messages + - In Alerta events + - In Grafana Dashboards + +- Logs are simple plain text files of the whole playbook output:: + + 2020-07-12 05:54:04.661170 | TASK [List Servers] + 2020-07-12 05:54:09.050491 | localhost | ok + 2020-07-12 05:54:09.067582 | TASK [Create Server in default AZ] + 2020-07-12 05:54:46.055650 | localhost | MODULE FAILURE: + 2020-07-12 05:54:46.055873 | localhost | Traceback (most recent call last): + 2020-07-12 05:54:46.057441 | localhost | + 2020-07-12 05:54:46.057499 | localhost | During handling of the above exception, another exception occurred: + 2020-07-12 05:54:46.057535 | localhost | + … + 2020-07-12 05:54:46.063992 | localhost | File "/tmp/ansible_os_server_payload_uz1c7_iw/ansible_os_server_payload.zip/ansible/modules/cloud/openstack/os_server.py", line 500, in _create_server + 2020-07-12 05:54:46.065152 | localhost | return self._send_request( + 2020-07-12 05:54:46.065186 | localhost | File "/root/.local/lib/python3.8/site-packages/keystoneauth1/session.py", line 1020, in _send_request + 2020-07-12 05:54:46.065334 | localhost | raise exceptions.ConnectFailure(msg) + 2020-07-12 05:54:46.065378 | localhost | keystoneauth1.exceptions.connection.ConnectFailure: Unable to establish connection to https://ims.eu-de.otctest.t-systems.com/v2/images: 
('Connection aborted.', OSError(107, 'Transport endpoint is not connected')) + 2020-07-12 05:54:46.295035 | + 2020-07-12 05:54:46.295241 | TASK [Delete server] + 2020-07-12 05:54:48.481374 | localhost | ok + 2020-07-12 05:54:48.505761 | + 2020-07-12 05:54:48.505906 | TASK [Delete SecurityGroup] + 2020-07-12 05:54:50.727174 | localhost | changed + 2020-07-12 05:54:50.745541 | + + +For further details how to work with logs please refer to +:ref:`How To Read The Logs And Understand The Issue ` FAQ +page. + diff --git a/doc/source/training/apimon_training/metrics.rst b/doc/source/training/apimon_training/metrics.rst new file mode 100644 index 0000000..ebf7e7c --- /dev/null +++ b/doc/source/training/apimon_training/metrics.rst @@ -0,0 +1,57 @@ +.. _metrics_definition: + +======= +Metrics +======= + +The Ansible playbook scenarios generate metrics in two ways: + +- The Ansible playbook internally invokes method calls to **OpenStack SDK + libraries.** They in turn generate metrics about each API call they do. This + requires some special configuration in the clouds.yaml file (currently + exposing metrics into statsd and InfluxDB is supported). For details refer + to the `config + documentation `_ + of the OpenStack SDK. The following metrics are captured: + + - response HTTP code + - duration of API call + - name of API call + - method of API call + - service type + +- Ansible plugins may **expose additional metrics** (i.e. whether the overall + scenario succeed or not) with help of `callback + plugin `_. + Since sometimes it is not sufficient to know only the timings of each API + call, Ansible callbacks are utilized to report overall execution time and + result (whether the scenario succeeded and how long it took). 
The following + metrics are captured: + + - test case + - playbook name + - environment + - action name + - result code + - result string + - service type + - state type + - total amount of (failed, passed, ignored, skipped tests) + +Custom metrics: + +In some situations more complex metric generation is required which consists of +execution of multiple tasks in scenario. For such cases, the tags parameter is +used. Once the specific tasks in playbook are tagged with some specific metric +name the metrics are calculated as sum of all executed tasks with respective +tag. It's useful in cases where the measured metric contains multiple steps to +achieve the desired state of service or service resource. For example, boot up of +virtual machine from deployment until successful login via SSH. + +.. code-block:: + + tags: ["metric=delete_server"] + tags: ["az={{ availability_zone }}", "service=compute", "metric=create_server{{ metric_suffix }}"] + +More details how to query metrics from databases are described on :ref:`Metric +databases ` page. diff --git a/doc/source/training/apimon_training/monitoring_coverage.rst b/doc/source/training/apimon_training/monitoring_coverage.rst new file mode 100644 index 0000000..f7d225b --- /dev/null +++ b/doc/source/training/apimon_training/monitoring_coverage.rst @@ -0,0 +1,51 @@ +=================== +Monitoring coverage +=================== + +Multiple factors define the monitoring coverage to simulate common customer use +cases. 
+ + +Monitored locations +################### + +* EU-DE +* EU-NL +* PREPROD (EU_DE) +* EU-CH2 (Swisscloud) + + +Monitoring sources +################## + +* Inside OTC (eu-de, eu-ch2) +* Outside OTC (Swisscloud) + + +Monitored targets +################# + +* Endpoints and HTTP query requests + + * all services + * multiple GET queries + +* Static Resources + + * specific services + * availability of the resource or resource functionality + +* Dynamic resources + + * ansible playbooks + * specific services + * monitoring of most common use cases in cloud services + + +Monitoring dashboards +##################### + +* KPI dashboards +* 24/7 dashboards +* Test results dashboards +* Specific service dashboards diff --git a/doc/source/training/apimon_training/notifications.rst b/doc/source/training/apimon_training/notifications.rst new file mode 100644 index 0000000..123c0bd --- /dev/null +++ b/doc/source/training/apimon_training/notifications.rst @@ -0,0 +1,68 @@ +============= +Notifications +============= + +Zulip as official OTC communication channel supports API interface for pushing +the notifications from ApiMon to various Zulip streams: + + - #Alerts Stream + - #Alerts-Hybrid Stream + - #Alerts-Preprod Stream + +Every stream contains topics based on the service type (if represented by +standalone Ansible playbook) and general apimon_endpoint_monitor topic which +contains alerts of GET queries towards all services. + + +.. image:: training_images/zulip_notifications.png + + +If the error has been acknowledged on Alerta, the new notification message for +repeating error won't get posted again on Zulip. + +Notifications contain further details which help to identify root cause faster +and more effectively. 
+ +Notification parameters +####################### + +The ApiMon notification consists of several fields: + ++---------------------------+------------------------------------------------------------------------+ +| Notification Field | Description | ++===========================+========================================================================+ +| **APIMon Alert link** | Reference to alert in Alerta | ++---------------------------+------------------------------------------------------------------------+ +| **Status** | Status of the alert in Alerta | ++---------------------------+------------------------------------------------------------------------+ +| **Environment** | Information about affected environment/region | ++---------------------------+------------------------------------------------------------------------+ +| **Severity** | Severity of the alarm | ++---------------------------+------------------------------------------------------------------------+ +| **Origin** | Information about origin location from where the job has been executed | ++---------------------------+------------------------------------------------------------------------+ +| **Service** | Information about affected service and type of monitoring | ++---------------------------+------------------------------------------------------------------------+ +| **Resource** | Further details in which particular resource issue has happened | ++---------------------------+------------------------------------------------------------------------+ +| **Error message Summary** | Short description of error result | ++---------------------------+------------------------------------------------------------------------+ +| **Execution Log link** | Reference to job execution output on Swift object storage | ++---------------------------+------------------------------------------------------------------------+ + +Th EpMon notification consists of several fields: + 
++----------------------------+------------------------------------------------------------------+ +| Notification Field | Description | ++============================+==================================================================+ +| **APIMon Alert link** | Reference to alert in Alerta | ++----------------------------+------------------------------------------------------------------+ +| **Environment** | Information about affected environment/region | ++----------------------------+------------------------------------------------------------------+ +| **Curl command** | Interpreted request in curl format for reproducible applications | ++----------------------------+------------------------------------------------------------------+ +| **Request error response** | Error result of the requested API call | ++----------------------------+------------------------------------------------------------------+ + + + diff --git a/doc/source/training/apimon_training/recorded_session.rst b/doc/source/training/apimon_training/recorded_session.rst new file mode 100644 index 0000000..39e19f7 --- /dev/null +++ b/doc/source/training/apimon_training/recorded_session.rst @@ -0,0 +1,14 @@ +.. _recorded_session: + +================ +Recorded Session +================ + +Session from 26.05.2023 has been recorded and videos are available on OBS. + +`Part 1 `_ + +`Part 2 `_ + +`Part 3 `_ + diff --git a/doc/source/training/apimon_training/test_scenarios.rst b/doc/source/training/apimon_training/test_scenarios.rst new file mode 100644 index 0000000..b87c3de --- /dev/null +++ b/doc/source/training/apimon_training/test_scenarios.rst @@ -0,0 +1,199 @@ +.. _test_scenarios: + +============== +Test Scenarios +============== + + +The Executor role of each API-Monitoring environment is responsible for +executing individual jobs (scenarios). Those can be defined as Ansible playbooks +(what allow them to be pretty much anything) or any other executable form (as +python script). 
With Ansible on its own having nearly limitless capability and +availability to execute anything else ApiMon can do pretty much anything. The +only expectation is that whatever is being done produces some form of metric for +further analysis and evaluation. Otherwise there is no sense in monitoring. The +scenarios are collected in a `Github +`_ and updated in +real-time. In general the mentioned test jobs do not need to take care of generating +data explicitly. Since the API related tasks in the playbooks rely on the Python +OpenStack SDK (and its OTC extensions), metric data is generated automatically by a +logging interface of the SDK ('openstack_api' metrics). Those metrics are +collected by statsd and stored to :ref:`graphite TSDB `. + +Additionally metric data is also generated by the executor service which collects +the playbook names, results and duration time ('ansible_stats' metrics) and +stores them to the :ref:`postgresql relational database `. + +The playbooks with monitoring scenarios are stored in a separate repository on +`Github `_ (the location +will change with the CloudMon replacement in `future +`_). Playbooks address the most common use cases +with cloud services conducted by end customers. + +The metrics generated by the Executor are described on the :ref:`Metric +Definitions ` page. + +In addition to metrics generated and captured by a playbook ApiMon also captures +the :ref:`stdout of the execution ` and saves this log for additional +analysis to OpenStack Swift storage where logs are being uploaded with a +configurable retention policy. + + +New Test Scenario introduction +============================== + +As already mentioned playbook scenarios are stored in a separate repository on +`Github `_. Due to the +fact that we have various environments which differ between each other by +location, supported services, different flavors, etc it's required to have a +monitoring configuration matrix which defines the monitoring standard and scope +for each environment. 
Therefore to enable playbook in some of the monitored +environments (PROD EU-DE, EU-NL, PREPROD, Swisscloud) further update is required +in the `monitoring matrix +`_. +This will be also matter of change in future once `StackMon +`_ will take place. + + +Rules for Test Scenarios +======================== + +Ansible playbooks need to follow some basic regression testing principles to +ensure sustainability of the endless execution of such scenarios: + +- **OpenTelekomCloud and OpenStack collection** + + - When developing test scenarios use available `Opentelekomcloud.Cloud + `_ or + `Openstack.Cloud + `_ + collections for native interaction with cloud in ansible. + - In case there are features not supported by collection you can still use + script module and call directly python SDK script to invoke required request + towards cloud + +- **Unique names of resources** + + - Make sure that resources don't conflict with each other and are easily + trackable by its unique name + +- **Teardown of the resources** + + - Make sure that deletion / cleanup of the resources is triggered even if some + of the tasks in playbooks will fail + - Make sure that deletion / cleanup is triggered in right order + +- **Simplicity** + + - Do not over-complicate test scenario. Use default auto-filled parameters + wherever possible + +- **Only basic / core functions in scope of testing** + + - ApiMon is not supposed to validate full service functionality. For such + cases we have different team / framework within QA responsibility + - Focus only on core functions which are critical for basic operation / + lifecycle of the service. 
+ - The less functions you use the less potential failure rate you will have on + running scenario for whatever reasons + +- **No hardcoding** + + - Every single hardcoded parameter in scenario will later lead to potential + outage of the scenario's run in future when such parameter might change + - Try to obtain all such parameters dynamically from the cloud directly. + +- **Special tags for combined metrics** + + - In case that you want to combine multiple tasks in playbook in single custom + metric you can do with using tags parameter in the tasks + + +Custom metrics in Test Scenarios +================================ + + +OpenStack SDK and otcextensions (otcextensions covers services which are out of +scope of OpenStack SDK and extends its functionality with services provided by +OTC) support metric generation natively for every single API call and ApiMon +executor supports collection of ansible playbook statistics so every single +scenario and task can store its result, duration and name in metric database. + +But in some cases there's a need to provide measurement on multiple tasks which +represent some important aspect of the customer use case. For example measure +the time and overall result from the VM deployment until successful login via +SSH. Single task results are stored as metrics in metric database but it would +be complicated to transfer processing logic of metrics on grafana. Therefore +tags feature on task level introduces possibility to address custom metrics. 
+ + +In following example (snippet from `scenario2_simple_ece.yaml +`_) +custom metric stores the result of multiple tasks in special metric name +create_server:: + + - name: Create Server in default AZ + openstack.cloud.server: + auto_ip: false + name: "{{ test_server_fqdn }}" + image: "{{ test_image }}" + flavor: "{{ test_flavor }}" + key_name: "{{ test_keypair_name }}" + network: "{{ test_network_name }}" + security_groups: "{{ test_security_group_name }}" + tags: + - "metric=create_server" + - "az=default" + register: server + + - name: get server id + set_fact: + server_id: "{{ server.id }}" + + - name: Attach FIP + openstack.cloud.floating_ip: + server: "{{ server_id }}" + tags: + - "metric=create_server" + - "az=default" + + - name: get server info + openstack.cloud.server_info: + server: "{{ server_id }}" + register: server + tags: + - "metric=create_server" + - "az=default" + + - set_fact: + server_ip: "{{ server['openstack_servers'][0]['public_v4'] }}" + tags: + - "metric=create_server" + - "az=default" + + - name: find servers by name + openstack.cloud.server_info: + server: "{{ test_server_fqdn }}" + register: servers + tags: + - "metric=create_server" + - "az=default" + + - name: Debug server info + debug: + var: servers + + # Wait for the server to really start and become accessible + - name: Wait for SSH port to become active + wait_for: + port: 22 + host: "{{ server_ip }}" + timeout: 600 + tags: ["az=default", "service=compute", "metric=create_server"] + + - name: Try connecting + retries: 10 + delay: 1 + command: "ssh -o 'UserKnownHostsFile=/dev/null' -o 'StrictHostKeyChecking=no' linux@{{ server_ip }} -i ~/.ssh/{{ test_keypair_name }}.pem" + tags: ["az=default", "service=compute", "metric=create_server"] + diff --git a/doc/source/training/apimon_training/training_images/24_7_dashboard.jpg b/doc/source/training/apimon_training/training_images/24_7_dashboard.jpg new file mode 100644 index 0000000..65f2ac6 Binary files /dev/null and 
b/doc/source/training/apimon_training/training_images/24_7_dashboard.jpg differ diff --git a/doc/source/training/apimon_training/training_images/alerta_alerts.png b/doc/source/training/apimon_training/training_images/alerta_alerts.png new file mode 100644 index 0000000..31516de Binary files /dev/null and b/doc/source/training/apimon_training/training_images/alerta_alerts.png differ diff --git a/doc/source/training/apimon_training/training_images/alerta_dashboard.png b/doc/source/training/apimon_training/training_images/alerta_dashboard.png new file mode 100644 index 0000000..c255812 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/alerta_dashboard.png differ diff --git a/doc/source/training/apimon_training/training_images/alerta_detail.jpg b/doc/source/training/apimon_training/training_images/alerta_detail.jpg new file mode 100644 index 0000000..0016ef1 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/alerta_detail.jpg differ diff --git a/doc/source/training/apimon_training/training_images/apimon_data_flow.svg b/doc/source/training/apimon_training/training_images/apimon_data_flow.svg new file mode 100644 index 0000000..a57638b --- /dev/null +++ b/doc/source/training/apimon_training/training_images/apimon_data_flow.svg @@ -0,0 +1,4 @@ + + + +

Scheduler


running
8 parallel
threads
Scheduler...
Add next playbook from the queue when thread is free
Add next playbook...
Graphite TSDB



Graphite TSDB...
Fill in playbooks to the queue of threads
Fill in playboo...
Execute ansible playbooks
Execute ansible...
Remove completed playbook from the thread
Remove complete...

Statsd


Collects the
metrics
Statsd...

Executor


Ansible
Executor...
Send metrics to graphite
Send metrics to...
Service Squad
Servic...
If playbook
failed raise alert
If playbook...
Store the
job logs to
object storage
Store the...
Data
Sources
Data...
Create Alerts based on Thresholds
Create Alerts...
O/M
O/M

Github


apimon tests
repository
Github...
Pull
repository

Pull...
Management
Manage...
Endless loop
Endless loop

Grafana


Dashboard
Grafana...

Alerta


Dashboard
Alerta...
Send notifications to Zulip
Send notificati...

Zulip


Alerts
Alerts-Hybrid
Alerts-Preprod
Zulip...
Swift

Swift
Postgresql RDB



Postgresql RDB...
Test results
Test resul...
Metrics
Metrics

Scheduler


running
8 parallel
threads
Scheduler...
1
1
2
2
3
3
4
4
6
6
5
5
7
7
8
8
9
9
10
10
11
11
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/source/training/apimon_training/training_images/apimon_test_results.jpg b/doc/source/training/apimon_training/training_images/apimon_test_results.jpg new file mode 100644 index 0000000..f6eb863 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/apimon_test_results.jpg differ diff --git a/doc/source/training/apimon_training/training_images/compute_service_statistics_1.jpg b/doc/source/training/apimon_training/training_images/compute_service_statistics_1.jpg new file mode 100644 index 0000000..8275ea5 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/compute_service_statistics_1.jpg differ diff --git a/doc/source/training/apimon_training/training_images/compute_service_statistics_2.jpg b/doc/source/training/apimon_training/training_images/compute_service_statistics_2.jpg new file mode 100644 index 0000000..fc96990 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/compute_service_statistics_2.jpg differ diff --git a/doc/source/training/apimon_training/training_images/dashboards.png b/doc/source/training/apimon_training/training_images/dashboards.png new file mode 100644 index 0000000..3237d0a Binary files /dev/null and b/doc/source/training/apimon_training/training_images/dashboards.png differ diff --git a/doc/source/training/apimon_training/training_images/epmon_dashboard_details.jpg b/doc/source/training/apimon_training/training_images/epmon_dashboard_details.jpg new file mode 100644 index 0000000..9b61729 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/epmon_dashboard_details.jpg differ diff --git a/doc/source/training/apimon_training/training_images/epmon_status_dashboard.jpg b/doc/source/training/apimon_training/training_images/epmon_status_dashboard.jpg new file mode 100644 index 0000000..414b40a Binary files /dev/null and 
b/doc/source/training/apimon_training/training_images/epmon_status_dashboard.jpg differ diff --git a/doc/source/training/apimon_training/training_images/graphite_query.jpg b/doc/source/training/apimon_training/training_images/graphite_query.jpg new file mode 100644 index 0000000..1321faa Binary files /dev/null and b/doc/source/training/apimon_training/training_images/graphite_query.jpg differ diff --git a/doc/source/training/apimon_training/training_images/kpi_dashboard.png b/doc/source/training/apimon_training/training_images/kpi_dashboard.png new file mode 100644 index 0000000..e179a98 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/kpi_dashboard.png differ diff --git a/doc/source/training/apimon_training/training_images/postgresql_query.jpg b/doc/source/training/apimon_training/training_images/postgresql_query.jpg new file mode 100644 index 0000000..9ecbff9 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/postgresql_query.jpg differ diff --git a/doc/source/training/apimon_training/training_images/zulip_notifications.png b/doc/source/training/apimon_training/training_images/zulip_notifications.png new file mode 100644 index 0000000..024f644 Binary files /dev/null and b/doc/source/training/apimon_training/training_images/zulip_notifications.png differ diff --git a/doc/source/training/apimon_training/workflow.rst b/doc/source/training/apimon_training/workflow.rst new file mode 100644 index 0000000..3fa1c6d --- /dev/null +++ b/doc/source/training/apimon_training/workflow.rst @@ -0,0 +1,28 @@ +.. _apimon_flow: + +ApiMon Flow Process +=================== + + +.. image:: training_images/apimon_data_flow.svg + :target: training_images/apimon_data_flow.svg + :alt: apimon_data_flow + + +#. Service squad adds test scenario to github repository. +#. Scheduler fetches test scenarios from Github and add them to queue. +#. Executor plays Ansible test scenario playbooks. Up to 8 parallel threads are enabled +#. 
Test scenario which has finished is being removed from the thread and the next + playbook in the queue is added to the free thread. The previous playbook is + added to the queue on the last position. +#. Test scenario statistics are stored in the Postgresql database. +#. Metrics from HTTP requests are collected by Statsd. +#. Collected metrics are stored in the time-series database Graphite. +#. Grafana uses metrics and statistics databases as the data sources for the + dashboards. The dashboard with various panels shows the real-time status of + the platform. Grafana also supports historical views and trends. +#. Breached thresholds as well as failed test scenarios result in generated + alerts on Alerta. +#. Notifications containing alert details are sent to Zulip. +#. Every test scenario stores its job output log into Swift object storage for further analysis and investigation. + diff --git a/doc/source/training/helpcenter_training/contact.rst b/doc/source/training/helpcenter_training/contact.rst new file mode 100644 index 0000000..1aacc33 --- /dev/null +++ b/doc/source/training/helpcenter_training/contact.rst @@ -0,0 +1,18 @@ +Contact - Whom to address for Feedback? +======================================= + +In case you have any feedback, proposals or found any issues regarding the HelpCenter 3.0, you can address them in the corresponding GitHub or Gitea repositories. + +Issues or feedback regarding the **internal Helpcenter** as well as new feature requests can be addressed by filing an issue on the internal **Gitea** repository under https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/issues + +If the documentation for a service is missing, please open a ticket on the doc-exports repository: https://gitea.eco.tsi-dev.otc-service.com/docs/doc-exports/issues + +If you have found any problems which affect the **production Helpcenter** please instead open an issue on **GitHub**. 
+ +- In case it is a general issue, create it in the docsportal repository: https://github.com/opentelekomcloud-docs/docsportal/issues +- If you have found anything wrong regarding a specific documentation, please open an issue on the corresponding repository under opentelekomcloud-docs. For example, on ECS you can open an issue here: https://github.com/opentelekomcloud-docs/elastic-cloud-server/issues + +If there is a request for completely new service or new document type (which was not yet introduced to HelpCenter 3.0) then please open an issue on the otc-metadata repository: https://gitea.eco.tsi-dev.otc-service.com/infra/otc-metadata/issues + + +For general questions you can write an E-Mail to the `Ecosystems Squad `_. \ No newline at end of file diff --git a/doc/source/training/helpcenter_training/difference_gitea_github.rst b/doc/source/training/helpcenter_training/difference_gitea_github.rst new file mode 100644 index 0000000..f0724e0 --- /dev/null +++ b/doc/source/training/helpcenter_training/difference_gitea_github.rst @@ -0,0 +1,51 @@ +.. _difference_gitea_github: + +========================= +Difference Gitea / Github +========================= + +Due to the several requirements on Huawei and TSI side 2 gitops stages introduced. At first stage Huawei imports documentation to Gitea and TSI review and approve it. +Afterwards documentation change is introduced to Github and TSI formally review it and approve. 
+ +But there are few more differences which are described in the table below: + ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| **Differences** | **Gitea** | **Github** | ++====================================================+====================================================+====================================================+ +| Link | https://gitea.eco.tsi-dev.otc-service.com/docs | https://github.com/opentelekomcloud-docs/ | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Environment | PREPROD | PROD | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| | internal | public | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Who can introduce changes | Huawei | Anyone (TSI, Huawei, customer) | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Source of documentation | Huawei | TSI+Huawei | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Form of change | Overwrite | Diff | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Portal | https://docs-int.otc-service.com | https://docs.otc.t-systems.com | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| 
Document types | - UMN, API, DEV, other public facing documents | - UMN, API, DEV, other public facing documents | +| | | | +| | - PD, HLD, CDR, other internal documents | | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ +| Stages | 1. Import of the documentation change by Huawei in | 1. Review of the documentation change in target | +| | doc-exports repo (html) | document repository | +| | | | +| | 2. Conversion of the documentation to target | 2. Resolve potential conflicts | +| | document repository (rst) | | +| | | 3. Approve and gate documentation change in target | +| | 3. Review of the documentation change in target | document repository | +| | document repository | | +| | | | +| | 4. Approve and gate documentation change in target | | +| | document repository | | +| | | | +| | 5. Approve and gate documentation change in | | +| | doc-exports repo | | +| | | | +| | 6. 
Zuul automatically creates documentation change | | +| | in Github repo | | ++----------------------------------------------------+----------------------------------------------------+----------------------------------------------------+ + diff --git a/doc/source/training/helpcenter_training/faq/are_there_any_plans_to_move_other_documents_cdr_hld_and_pd_as_well_to_the_platform_so_we_can_handle_all_documents_in_one_place.rst b/doc/source/training/helpcenter_training/faq/are_there_any_plans_to_move_other_documents_cdr_hld_and_pd_as_well_to_the_platform_so_we_can_handle_all_documents_in_one_place.rst new file mode 100644 index 0000000..b9e48b6 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/are_there_any_plans_to_move_other_documents_cdr_hld_and_pd_as_well_to_the_platform_so_we_can_handle_all_documents_in_one_place.rst @@ -0,0 +1,9 @@ +================================================================================================================================= +Are there any plans to move other documents CDR, HLD, and PD as well to the platform so we can handle all documents in one place? +================================================================================================================================= + +Yes, the plan for this year is: + +- Start integration of Hybrid Documentation to Help Center 3.0. Kevin Heyong has already confirmed that he initiates the talks to R&D. 
+ +- Start integration of internal Huawei documents to Help Center 3.0 (this will be a bit challenging as this sort of documents are not present in Huawei documentation system and R&D is taking care of them by their own so the only existing source of the documentation is doc word type) diff --git a/doc/source/training/helpcenter_training/faq/faq_images/gate_label.png b/doc/source/training/helpcenter_training/faq/faq_images/gate_label.png new file mode 100644 index 0000000..ae52bed Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/gate_label.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_1.png b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_1.png new file mode 100644 index 0000000..6207a88 Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_1.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_2.png b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_2.png new file mode 100644 index 0000000..9f745cf Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_2.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_3.png b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_3.png new file mode 100644 index 0000000..9e78b2b Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_3.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_4.png b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_4.png new file mode 100644 index 0000000..307e6e9 Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_4.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_5.png 
b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_5.png new file mode 100644 index 0000000..577a1e3 Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_5.png differ diff --git a/doc/source/training/helpcenter_training/faq/faq_images/html_preview_6.png b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_6.png new file mode 100644 index 0000000..0cb081f Binary files /dev/null and b/doc/source/training/helpcenter_training/faq/faq_images/html_preview_6.png differ diff --git a/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_a_non-conformity_is_found_in_an_already_released_deployed_prod_documentation.rst b/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_a_non-conformity_is_found_in_an_already_released_deployed_prod_documentation.rst new file mode 100644 index 0000000..113622b --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_a_non-conformity_is_found_in_an_already_released_deployed_prod_documentation.rst @@ -0,0 +1,5 @@ +====================================================================================================================== +How and where should we submit bugs when a non-conformity is found in an already released/deployed PROD documentation? +====================================================================================================================== + +Using Gitea/Github issues on respective repositories already mentioned in :ref:`very first question `. 
diff --git a/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_the_documentation_url_is_wrong_the_link_is_not_working.rst b/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_the_documentation_url_is_wrong_the_link_is_not_working.rst new file mode 100644 index 0000000..e12f6c2 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_and_where_should_we_submit_bugs_when_the_documentation_url_is_wrong_the_link_is_not_working.rst @@ -0,0 +1,5 @@ +================================================================================================== +How and where should we submit bugs when the documentation URL is wrong (the link is not working)? +================================================================================================== + +Using Gitea/Github issues on respective repositories already mentioned in :ref:`very first question ` depending on the affected environment. diff --git a/doc/source/training/helpcenter_training/faq/how_can_the_docs_be_deployed_to_pre-prod_and_prod_and_who_will_be_the_responsible_to_do_that.rst b/doc/source/training/helpcenter_training/faq/how_can_the_docs_be_deployed_to_pre-prod_and_prod_and_who_will_be_the_responsible_to_do_that.rst new file mode 100644 index 0000000..535afad --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_can_the_docs_be_deployed_to_pre-prod_and_prod_and_who_will_be_the_responsible_to_do_that.rst @@ -0,0 +1,17 @@ +============================================================================================= +How can the docs be deployed to PRE-PROD and PROD and who will be the responsible to do that? +============================================================================================= + +By triggering the label "**gate**" on gitea/github the respective automated jobs will trigger final merge and build of the documentation as well as publishing of the documentation to respective portal (PREPROD/PROD). 
Responsibility can be addressed to the labelling permission. Remaining stuff is in hands of zuul automation. + +**Prerequesits to set the Gate label:** + +1) Confirmity check of Zuul is successful (Zuul approval). +2) Approval of Pull Request by Squad member is there. + +If one of these prerequesits is not there. The Zuul pipeline will not start to work and to release the document. + +**How to set the Gate label:** + +.. image:: faq_images/gate_label.png + diff --git a/doc/source/training/helpcenter_training/faq/how_can_we_accept_the_entire_document_when_we_are_fine_with_it_who_will_have_the_right_to_make_a_doc_version_final.rst b/doc/source/training/helpcenter_training/faq/how_can_we_accept_the_entire_document_when_we_are_fine_with_it_who_will_have_the_right_to_make_a_doc_version_final.rst new file mode 100644 index 0000000..8f571a4 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_can_we_accept_the_entire_document_when_we_are_fine_with_it_who_will_have_the_right_to_make_a_doc_version_final.rst @@ -0,0 +1,7 @@ +==================================================================================================================== +How can we accept the entire document when we are fine with it? Who will have the right to make a doc version final? +==================================================================================================================== + +See the last option in :ref:`Previous question `. This can be delegated to different person chosen by squad. + +Also please refer for more details in :ref:`general change process flow `. diff --git a/doc/source/training/helpcenter_training/faq/how_can_we_revert_back_to_a_previous_document_version.rst b/doc/source/training/helpcenter_training/faq/how_can_we_revert_back_to_a_previous_document_version.rst new file mode 100644 index 0000000..31c4056 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_can_we_revert_back_to_a_previous_document_version.rst @@ -0,0 +1,13 @@ +.. 
_how_can_we_revert_back_to_a_previous_document_version: + +====================================================== +How can we revert back to a previous document version? +====================================================== + +In git the revert action would mean just another PR with changes pushing the document to the previous state. To minimize revert situations both Gitea/Github represent 3 phases of validating the documentation. + +- Automated check jobs which validate syntax, conversion and build possibility of the documentation change. + +- Manual approval based on QA/UAT review. + +- Labelling the PR with the **gate** label which is only applicable when the previous 2 conditions are successfully completed. (The final Gate label will initiate auto-merge jobs and final publishing of the documentation change to the Help Center portal). The squad can decide that this final "Go" can be triggered by a different person only delegated for releasing activities. diff --git a/doc/source/training/helpcenter_training/faq/how_do_we_deal_with_open_tickets_may_we_have_a_separate_session_for_that.rst b/doc/source/training/helpcenter_training/faq/how_do_we_deal_with_open_tickets_may_we_have_a_separate_session_for_that.rst new file mode 100644 index 0000000..87ca986 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_do_we_deal_with_open_tickets_may_we_have_a_separate_session_for_that.rst @@ -0,0 +1,5 @@ +========================================================================== +How do we deal with Open tickets? May we have a separate session for that? +========================================================================== + +The requested squad has been asked to move all open tickets to the Ecosystem squad. Most of them are already solved or obsolete. 
diff --git a/doc/source/training/helpcenter_training/faq/how_does_document_versioning_work_during_the_document_updates_how_will_the_final_accepted_document_be_versioned.rst b/doc/source/training/helpcenter_training/faq/how_does_document_versioning_work_during_the_document_updates_how_will_the_final_accepted_document_be_versioned.rst new file mode 100644 index 0000000..a484490 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_does_document_versioning_work_during_the_document_updates_how_will_the_final_accepted_document_be_versioned.rst @@ -0,0 +1,5 @@ +================================================================================================================= +How does document versioning work during the document updates? How will the final accepted document be versioned? +================================================================================================================= + +One document version can be understood as one documentation PR. Even if in PR there are mutliple commits based on several review iterations at the ends those commits are squashed to a single change and presented as one change in gitea/github. Of course in case of valid reasons Huawei might introduce more version updates directly in change history chapter. diff --git a/doc/source/training/helpcenter_training/faq/how_does_the_full_document_review_workflow_look_like_end-to-end_on_a_high_level.rst b/doc/source/training/helpcenter_training/faq/how_does_the_full_document_review_workflow_look_like_end-to-end_on_a_high_level.rst new file mode 100644 index 0000000..13df8f7 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_does_the_full_document_review_workflow_look_like_end-to-end_on_a_high_level.rst @@ -0,0 +1,8 @@ +================================================================================ +How does the full document review workflow look like end-to-end on a high level? 
+================================================================================ + +The process of the Documentation Change Process under the following link: `https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/wiki/Process `_ . + +The more detailed description of the QA/UAT document review under the following link: `https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/wiki/Review `_ . + diff --git a/doc/source/training/helpcenter_training/faq/how_the_document_traceability_to_dms_rms_in_jira_will_work_who_will_be_the_responsible_to_place_the_links_in_the_related_jira_ticket.rst b/doc/source/training/helpcenter_training/faq/how_the_document_traceability_to_dms_rms_in_jira_will_work_who_will_be_the_responsible_to_place_the_links_in_the_related_jira_ticket.rst new file mode 100644 index 0000000..15aa939 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_the_document_traceability_to_dms_rms_in_jira_will_work_who_will_be_the_responsible_to_place_the_links_in_the_related_jira_ticket.rst @@ -0,0 +1,8 @@ +====================================================================================================================================== +How the document traceability to DMs/RMs in JIRA will work, who will be the responsible to place the links in the related JIRA ticket? +====================================================================================================================================== + +Huawei is responsible to provide link to Gitea pull request in Huawei Documentation Delivery task in RM: + +.. 
figure:: /_static/images/jira_document_pr_link.png + diff --git a/doc/source/training/helpcenter_training/faq/how_to_accept_a_single_change_comment.rst b/doc/source/training/helpcenter_training/faq/how_to_accept_a_single_change_comment.rst new file mode 100644 index 0000000..6b4f3b1 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_accept_a_single_change_comment.rst @@ -0,0 +1,5 @@ +====================================== +How to accept a single change/comment? +====================================== + +By clicking on comments' resolve conversation. diff --git a/doc/source/training/helpcenter_training/faq/how_to_add_a_comment_for_a_text_or_an_image.rst b/doc/source/training/helpcenter_training/faq/how_to_add_a_comment_for_a_text_or_an_image.rst new file mode 100644 index 0000000..783b67f --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_add_a_comment_for_a_text_or_an_image.rst @@ -0,0 +1,14 @@ +============================================ +How to add a comment for a text or an image? +============================================ + +As natively in Gitea/Github any change in PR can be raised as a comment and can become subject for requesting the change + +Adding a comment is a simple thing as described on the following link: `https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/wiki/Review#adding_comments `_ . + +Adding a comment for an image is not possible directly as shown on previous link but can be done simply at Conversation tab. + +In case you want to comment something not being subject of change in PR but still valuable to comment (other part of documentation) you can raise an issue for that. 
+ + + diff --git a/doc/source/training/helpcenter_training/faq/how_to_check_the_rendered_html_of_the_entire_document_in_the_browser_which_button_should_i_click_on.rst b/doc/source/training/helpcenter_training/faq/how_to_check_the_rendered_html_of_the_entire_document_in_the_browser_which_button_should_i_click_on.rst new file mode 100644 index 0000000..106e21c --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_check_the_rendered_html_of_the_entire_document_in_the_browser_which_button_should_i_click_on.rst @@ -0,0 +1,27 @@ +====================================================================================================== +How to check the rendered HTML of the entire document in the browser (which button should I click on)? +====================================================================================================== + +**1) Open the PullRequest (PR) of the prefered document.** + +.. image:: faq_images/html_preview_1.png + +**2) Click on the `Conversation` tab and scroll down.** + +.. image:: faq_images/html_preview_2.png + +**3) Choose the link on `build-otc-` or the `Details` link on the Zuul Check-Job.** + +.. image:: faq_images/html_preview_3.png + +**4) The Zuul dashboards opens. Under the `Artifacts` tab the `Docs preview site` can be chosen.** + +.. image:: faq_images/html_preview_4.png + +**5) Sometimes a file structure opens, choose the document type.** + +.. image:: faq_images/html_preview_5.png + +**6) The preview site opens.** + +.. 
image:: faq_images/html_preview_6.png diff --git a/doc/source/training/helpcenter_training/faq/how_to_compare_the_content_of_a_change_with_the_base_modified_text_images.rst b/doc/source/training/helpcenter_training/faq/how_to_compare_the_content_of_a_change_with_the_base_modified_text_images.rst new file mode 100644 index 0000000..bd4ca82 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_compare_the_content_of_a_change_with_the_base_modified_text_images.rst @@ -0,0 +1,21 @@ +============================================================================ +How to compare the content of a change with the base (modified text/images)? +============================================================================ + +Gitea/Github natively support the comparison between the diffs and new PR is nothing else just a diff to a base. Text comparison is represented as a "red" (removals) and "green" (addons) highlighted changes as shown on the image below: + +.. figure:: /_static/images/compare_text.png + +Image comparison offers multiple options how to visualize changes. The most common is side by side comparison: + +.. figure:: /_static/images/compare_images.png + +In case there are only added changes or the whole new files are added then the whole file is highlighted by green color: + +.. figure:: /_static/images/added_new_text.png + + +Gitea/Github also natively support comparison changes between the multiple commits inside of the single pull request. For example follow-up updates from Huawei based on QA/UAT reviews are represented as multiple commits in same PR. The differences between the commits can be seen by clicking on the respecitve commit which will show the differences against the previous commit. + +.. 
figure:: /_static/images/compare_commits.png + diff --git a/doc/source/training/helpcenter_training/faq/how_to_create_a_gitea_account.rst b/doc/source/training/helpcenter_training/faq/how_to_create_a_gitea_account.rst new file mode 100644 index 0000000..e85c234 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_create_a_gitea_account.rst @@ -0,0 +1,11 @@ +============================== +How to create a Gitea account? +============================== + +To create a Gitea account, just click on Log-In in the top-right corner of the landing page. Then simply Log-In with your OTC-LDAP credentials. +You should immediately be logged-in with your account. + +If you want to be able to use Single-Sign-On with your GitHub account, please write an E-Mail to the `Ecosystems Squad `_. + +Once your GitHub account has been approved, you can Log-In into Gitea using the button in the top-right corner. Select "Sign In With OpenID" and then click on the GitHub button. +In case the system asks you to link accounts because you have an LDAP and GitHub account registered, please choose "Link accounts" and Log-In once with your LDAP credentials. After that, you can always Log-In through GitHub. \ No newline at end of file diff --git a/doc/source/training/helpcenter_training/faq/how_to_reply_to_a_comment.rst b/doc/source/training/helpcenter_training/faq/how_to_reply_to_a_comment.rst new file mode 100644 index 0000000..8080802 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_reply_to_a_comment.rst @@ -0,0 +1,5 @@ +========================== +How to reply to a comment? 
+========================== + +Natively supported by Gitea/Github by clicking on the comment's options and choosing quote reply. diff --git a/doc/source/training/helpcenter_training/faq/how_to_request_a_document_modification_for_a_single_comment_or_for_multiple_comments__how_to_notify_huawei_that_we_need_an_update.rst b/doc/source/training/helpcenter_training/faq/how_to_request_a_document_modification_for_a_single_comment_or_for_multiple_comments__how_to_notify_huawei_that_we_need_an_update.rst new file mode 100644 index 0000000..a8635d8 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_to_request_a_document_modification_for_a_single_comment_or_for_multiple_comments__how_to_notify_huawei_that_we_need_an_update.rst @@ -0,0 +1,14 @@ +==================================================================================================================================== +How to request a document modification (for a single comment or for multiple comments)? How to notify Huawei that we need an update? +==================================================================================================================================== + +After finishing the review (and raising the comments) you have 3 options for closing the review: + +- **Approve** - The PR is approved and can be moved to the next stage + +- **Comment** - Raising the comments without an explicit requirement of any changes (not blocking approval) + +- **Request Changes** - Approval is not given and the PR is blocked by requesting changes, which means that the PR should receive another commit of changes, and only after that will the PR be ready for the next review round + +You can see further details at the following link: `https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/wiki/Review#finish_review `_ . 
+ diff --git a/doc/source/training/helpcenter_training/faq/how_will_we_be_notified_once_huawei_will_upload_something_new_doc_or_modify_existing_doc.rst b/doc/source/training/helpcenter_training/faq/how_will_we_be_notified_once_huawei_will_upload_something_new_doc_or_modify_existing_doc.rst new file mode 100644 index 0000000..a6b3d64 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/how_will_we_be_notified_once_huawei_will_upload_something_new_doc_or_modify_existing_doc.rst @@ -0,0 +1,5 @@ +============================================================================================ +How will we be notified once Huawei will upload something (new doc, or modify existing doc)? +============================================================================================ + +After Huawei creates a new PR in the doc-exports repository, the respective PR is auto-created in the target document repository (for example in https://gitea.eco.tsi-dev.otc-service.com/docs/resource-template-service/pulls). QA/UAT or other squad members are also members of the Gitea group with defined ownership of the respective service document repositories, and they should be automatically notified by email about newly created PRs. diff --git a/doc/source/training/helpcenter_training/faq/index.rst b/doc/source/training/helpcenter_training/faq/index.rst new file mode 100644 index 0000000..cbd0d07 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/index.rst @@ -0,0 +1,30 @@ +========================== +Frequently Asked Questions +========================== + +.. 
toctree:: + :maxdepth: 1 + + how_do_we_deal_with_open_tickets_may_we_have_a_separate_session_for_that + where_and_how_can_i_access_the_system_to_check_all_squad_components_and_the_current_tasks_or_raised_issues + are_there_any_plans_to_move_other_documents_cdr_hld_and_pd_as_well_to_the_platform_so_we_can_handle_all_documents_in_one_place + how_does_the_full_document_review_workflow_look_like_end-to-end_on_a_high_level + which_stages_or_steps_should_be_performed_in_gitea_and_which_ones_in_github + what_are_the_exact_locations_where_the_documents_can_be_found_per_each_component + who_and_how_will_request_a_new_document_update_when_for_instance_a_new_feature_is_planned_to_be_released_for_a_component_in_the_next_delivery_cycle + how_will_we_be_notified_once_huawei_will_upload_something_new_doc_or_modify_existing_doc + how_the_document_traceability_to_dms_rms_in_jira_will_work_who_will_be_the_responsible_to_place_the_links_in_the_related_jira_ticket + how_to_check_the_rendered_html_of_the_entire_document_in_the_browser_which_button_should_i_click_on + how_to_compare_the_content_of_a_change_with_the_base_modified_text_images + how_to_add_a_comment_for_a_text_or_an_image + how_to_reply_to_a_comment + how_to_accept_a_single_change_comment + how_to_request_a_document_modification_for_a_single_comment_or_for_multiple_comments__how_to_notify_huawei_that_we_need_an_update + how_does_document_versioning_work_during_the_document_updates_how_will_the_final_accepted_document_be_versioned + how_can_we_revert_back_to_a_previous_document_version + how_can_we_accept_the_entire_document_when_we_are_fine_with_it_who_will_have_the_right_to_make_a_doc_version_final + how_can_the_docs_be_deployed_to_pre-prod_and_prod_and_who_will_be_the_responsible_to_do_that + how_and_where_should_we_submit_bugs_when_a_non-conformity_is_found_in_an_already_released_deployed_prod_documentation + how_and_where_should_we_submit_bugs_when_the_documentation_url_is_wrong_the_link_is_not_working + 
where_should_we_check_whether_a_document_related_bug_exists_or_not_for_our_components_if_a_customer_opens_it_or_it_is_coming_from_another_squad + how_to_create_a_gitea_account \ No newline at end of file diff --git a/doc/source/training/helpcenter_training/faq/what_are_the_exact_locations_where_the_documents_can_be_found_per_each_component.rst b/doc/source/training/helpcenter_training/faq/what_are_the_exact_locations_where_the_documents_can_be_found_per_each_component.rst new file mode 100644 index 0000000..6d60595 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/what_are_the_exact_locations_where_the_documents_can_be_found_per_each_component.rst @@ -0,0 +1,11 @@ +================================================================================= +What are the exact locations where the documents can be found per each component? +================================================================================= + +**PreProd-Environment (Gitea):** + +* https://gitea.eco.tsi-dev.otc-service.com/org/docs/teams/docs-orchestration-rw/repositories + +**Prod-Environment (GitHub):** + +* https://github.com/orgs/opentelekomcloud-docs/teams/docs-orchestration-rw/repositories diff --git a/doc/source/training/helpcenter_training/faq/where_and_how_can_i_access_the_system_to_check_all_squad_components_and_the_current_tasks_or_raised_issues.rst b/doc/source/training/helpcenter_training/faq/where_and_how_can_i_access_the_system_to_check_all_squad_components_and_the_current_tasks_or_raised_issues.rst new file mode 100644 index 0000000..cd2f6c1 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/where_and_how_can_i_access_the_system_to_check_all_squad_components_and_the_current_tasks_or_raised_issues.rst @@ -0,0 +1,33 @@ +.. 
_where_and_how_can_i_access_the_system_to_check_all_squad_components_and_the_current_tasks_or_raised_issues: + +=========================================================================================================== +Where and how can I access the system to check all squad components and the current tasks or raised issues? +=========================================================================================================== + +There are multiple places based on source of the review task. All Gitea links are related to Huawei changes and changes being introduced on the PREPROD docportal: + +- https://gitea.eco.tsi-dev.otc-service.com/docs/doc-exports/pulls - general place where Huawei is introducing new PRs with the documents imports in HTML file (this is meta repository and not the final repository for a review of the change) + +- https://gitea.eco.tsi-dev.otc-service.com/docs/TARGET-SERVICE-NAME/pulls - this is place where Huawei's PRs are converted to service RST PRs which are ready for a review by QA/UAT for example https://gitea.eco.tsi-dev.otc-service.com/docs/resource-template-service/pulls + +- https://gitea.eco.tsi-dev.otc-service.com/org/docs/teams/docs-orchestration-rw/repositories - good starting point for seeing all service doc repositories of the whole squad for PREPROD documentation + +- https://github.com/opentelekomcloud-docs/TARGET-SERVICE-NAME/pulls - this is place where PRs are being created for changes coming from gitea after approval or from external changes (customer/TSI..) for example https://github.com/opentelekomcloud-docs/resource-template-service/pulls + +- https://github.com/orgs/opentelekomcloud-docs/teams/docs-orchestration-rw/repositories - good starting point for seeing all service doc repositories of the whole squad for PROD documentation + + +.. note:: + + In future we plan to implement also some monitoring dashboard to have all different PRs under one roof + + +There are multiple places based on source of the issue. 
All Gitea links are related to issues addressed to Huawei or Ecosystem squad and issues related to PREPROD doc portal + +- https://gitea.eco.tsi-dev.otc-service.com/docs/docsportal/issues - general PREPROD docsportal issues + +- https://gitea.eco.tsi-dev.otc-service.com/docs/TARGET-SERVICE-NAME/issues - this is place for service based issue towards Huawei or Ecosystem squad for PREPROD for example https://gitea.eco.tsi-dev.otc-service.com/docs/resource-template-service/issues + +- https://github.com/opentelekomcloud-docs/docsportal/issues - general PROD docsportal issues (also customers can raise the issues here) + +- https://github.com/opentelekomcloud-docs/TARGET-SERVICE-NAME/issues - this is place for service based issue towards TSI (also customers can raise issues here) for PROD for example https://github.com/opentelekomcloud-docs/resource-management-service/issues diff --git a/doc/source/training/helpcenter_training/faq/where_should_we_check_whether_a_document_related_bug_exists_or_not_for_our_components_if_a_customer_opens_it_or_it_is_coming_from_another_squad.rst b/doc/source/training/helpcenter_training/faq/where_should_we_check_whether_a_document_related_bug_exists_or_not_for_our_components_if_a_customer_opens_it_or_it_is_coming_from_another_squad.rst new file mode 100644 index 0000000..b179415 --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/where_should_we_check_whether_a_document_related_bug_exists_or_not_for_our_components_if_a_customer_opens_it_or_it_is_coming_from_another_squad.rst @@ -0,0 +1,6 @@ +================================================================================================================================================= +Where should we check whether a document related bug exists or not for our components, if a customer opens it or it is coming from another squad? 
+================================================================================================================================================= + +Looking at Gitea/Github issues on the respective repositories already mentioned in :ref:`very first question `. Eventually, in the future, we would like to introduce a dashboard for monitoring and visualizing such issues based on the service/squad. + diff --git a/doc/source/training/helpcenter_training/faq/which_stages_or_steps_should_be_performed_in_gitea_and_which_ones_in_github.rst b/doc/source/training/helpcenter_training/faq/which_stages_or_steps_should_be_performed_in_gitea_and_which_ones_in_github.rst new file mode 100644 index 0000000..fdfbedd --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/which_stages_or_steps_should_be_performed_in_gitea_and_which_ones_in_github.rst @@ -0,0 +1,8 @@ +============================================================================ +Which stages or steps should be performed in Gitea and which ones in GitHub? +============================================================================ + +Gitea represents PREPROD documentation and eventually internal documentation while Github represents
diff --git a/doc/source/training/helpcenter_training/faq/who_and_how_will_request_a_new_document_update_when_for_instance_a_new_feature_is_planned_to_be_released_for_a_component_in_the_next_delivery_cycle.rst b/doc/source/training/helpcenter_training/faq/who_and_how_will_request_a_new_document_update_when_for_instance_a_new_feature_is_planned_to_be_released_for_a_component_in_the_next_delivery_cycle.rst new file mode 100644 index 0000000..c1ab3ca --- /dev/null +++ b/doc/source/training/helpcenter_training/faq/who_and_how_will_request_a_new_document_update_when_for_instance_a_new_feature_is_planned_to_be_released_for_a_component_in_the_next_delivery_cycle.rst @@ -0,0 +1,13 @@ +==================================================================================================================================================== +Who and how will request a new document update when for instance a new feature is planned to be released for a component in the next delivery cycle? +==================================================================================================================================================== + +Again, there are multiple ways to request a documentation update: + +- as a part of the standard JIRA RM process there is a task for Huawei (delivery documentation task) with the mandatory field Documentation PR link, which Huawei needs to fill in with a Gitea link to be able to close the task. 
+ +- as a new issue in https://gitea.eco.tsi-dev.otc-service.com/docs/doc-exports/issues + +- as a new issue in https://gitea.eco.tsi-dev.otc-service.com/docs/TARGET-SERVICE-NAME/issues + +- email contact to Huawei R&D from this link: https://confluence.tsi-dev.otc-service.com/display/HUAW/Documentation+Gitops+Rollout+and+Status+page diff --git a/doc/source/training/helpcenter_training/index.rst b/doc/source/training/helpcenter_training/index.rst new file mode 100644 index 0000000..ae0ad48 --- /dev/null +++ b/doc/source/training/helpcenter_training/index.rst @@ -0,0 +1,12 @@ +=================== +Helpcenter Training +=================== + +.. toctree:: + :maxdepth: 1 + + introduction + workflow + difference_gitea_github + contact + faq/index \ No newline at end of file diff --git a/doc/source/training/helpcenter_training/introduction.rst b/doc/source/training/helpcenter_training/introduction.rst new file mode 100644 index 0000000..4d255e4 --- /dev/null +++ b/doc/source/training/helpcenter_training/introduction.rst @@ -0,0 +1,50 @@ +============ +Introduction +============ + +The HelpCenter3.0 is Open Telekom Cloud product developed by Ecosystem squad introducing new approach in the documentation management. +In order to improve documentation exchange between delivery partners and the TSI a new documentation platform and processes +based on GitOps are introduced with following benefits: + +- Openness +- Transparency +- Comprehensive review capabilities +- Full control during the documentation lifecycle +- Documentation as a source code + +The target of the new platform is to store and maintain all documents in Git. This provides benefits of precise identification of changes +and preventing undesired versions to be published. The process is heavily streamlined. 
Once those changes are approved and merged, +the connected pipelines ensure that the documentation is published fully automatically, which eases the document auditing and enables users +to see the complete history of changes tracked by git itself. + +.. figure:: /_static/images/helpcenter_gitops.png + +The solution is completely open-source based and the HLD is described at https://docs.otc-service.com/system-config/docsportal.html +Implementation is based on: + +- reStructuredText (RST) as source documentation format +- Gitea/Github as a repository +- Zuul as a CI/CD engine for workflows +- Sphinx as a documentation rendering framework (HTML/PDF/...) +- OpenSearch as a search engine +- Swift object storage as a storage for documentation +- Pandoc as a documentation converter +- OTC Infrastructure (ECS, CCE, ELB, ...) + +HC3.0 comes with the following features: + +- Support of UMN, API, DEV and other public facing documents +- PROD and PREPROD documentation portal: + + - docs.otc.t-systems.com + - docs-int.otc.service.com + +- Support of all old HC links redirections +- Search functionality +- Mobile-ready UI layout +- Report issue functionality directly on any page +- Suggest documentation fix functionality +- Consolidation of extra content like blueprints, tools, and libraries for developers +- One repository represents one cloud service +- Each squad can control and manage their documentation independently +- Automatization and check jobs across the whole documentation lifecycle (from import to release) diff --git a/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process.drawio.png b/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process.drawio.png new file mode 100644 index 0000000..10e5a3b Binary files /dev/null and b/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process.drawio.png differ diff --git a/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process_1.3.xml 
b/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process_1.3.xml new file mode 100644 index 0000000..7120b83 --- /dev/null +++ b/doc/source/training/helpcenter_training/training_images/helpcenter_3.0_process_1.3.xml @@ -0,0 +1,2 @@ + +7T3ZcuKwsl+TqnMfhvK+PBJMEs+NnQTIAi+3wDhgYjAHQwx8/ZW8gJEEmMVLMk7VjPEmyd2t3tTdumFr4+X9rDsdak7ftG8Yqr+8YZUbhmE5QQAHeGUVXOEoJrgwmFn94BK9vdC01mZ4kQqvLqy+6e48OHcce25Ndy8azmRiGvOda93ZzPF2H/t07N1ep91B2CO1vdA0uraJPfZu9efD4KrEiNvrD6Y1GEY904Ic3Bl3o4fDht1ht+94sUts/YatzRxnHvwaL2umDYEXwWVQW6+/zUYdAJb9v4+/D19tmfkTNHZ3yiubT5iZk/mVm+bDb5uvIoCZfQC/8NSZzYfOwJl07fr26u3MWUz6JmyWAmfbZx4dZwou0uDiyJzPVyExdBdzB1wazsd2eNdcWvMP+HqFl4TwvO2fy7IUnivLsH3/ZBWdTOazVfhqdNqO39u+5p+tYo08mzNrbM7NWXgtIVRD6EOwxCgqhPG96YAmZyvwwMy0u3Pre5fsuiH1DjbPbV59dizQL0OFM+0PoMmKRG3/wmkWTjtaZOSKJPHo7agD11nMDDNsc0sF4EdskNtLPm2Q6cTm18r8f1urpwe6+jx0F2/364eQtKjvrr0IPxCjm12q8IbW3GxOuwa86wGA7lKAO585X2bNsZ2Z/zYr+H/gzqczmYeEw8jheey54OPhdcu2o+sTZwLpst91h/4Atj1EU549Gd/f5mxuLg9iPMIcwMoOrngxPPe2DEeO+OEwxmzY6CKJTHZQeCq+6HJal9P6OJkw2LQmEI5tA1ltHp/TXXcaCPBPawnJKD6VWS46DxtOKtPOmI2csANiWRIrPDYfOR6fjtG1q89GlsA9BXsegmQH3MJ/F05044/rA68KHmCY6dKn7eg++DWAx9dpvzs3wRPOJ2zHnH1bBjztO8ZiDEALaNeZRL2BwQcdBu8eYOH0cXTHGTVDYun1qnAbsPQYqwbX76RavVZLkyGL8i5DlgWcIbMMgSHTVFoMmTsuQMHXzY9IyZ7Eczwq/cD1T8kwDQPOQdsaTMA1A4AUcsRbCDULaMPV8MbY6vftfZN5VwLEEExzqaJLkqMZGiJM4PEZKxPwxaSFLv4q6BJqUv32DkeXUq1Ld7VfhC45Z3QJGLoeFl3PtE5GWqhJEpTLlFDls87UUCWIPDazCLqpQFBN00KVuFcWutPu5CRZSEFs/gkRA+/Y5mcg16he1/ga+ED/YwR4hPetiTW3ujZRkNacCQQseOqhpT36XgrwX6PZignPYIR7hGcCfiAxPZYkEfu8KfW5lPkBGLxhTQbglM9S9tIilbvwlTCa6ywW8K2aCtqrKSmwiZAUfwSTEFB+LrI4yuhMuYR8FfnbF+Ue5iyA6tKnKfwmdUkUc5a/NO4figQw5f530e2D49gc9wCMy6mGoI7AHXmSryg13NEY7s6Za7Igsl2CbDNpIN3E3zPXJDrvuYY7bZrhFHueOf2FAXWYJ29SzjUcdbnPNdwT9AIVU9gqQ71WgbJJzcxvy/RK7OFGZu7Yw504zbjDzfVNjK0F8enMxt05hkd32J3Cn5GL7rirrRfg4LH3A3xvrIDo/zRHUCbFTPV/OoE7x5z0q3C5GUoou+u6lrGLg8PzILb0cbNd+IgWRciLHvgK1bnLIKchM1hTOACukEnNu7OBOT/wXCTijq6hxDBPmrDRtUuXWnhx1/NPMyJCUXuWU/CmBIlGqBjlKgF0UluZofe7tChl17dPNVfu3
Byfo7T1hZ7AE9jE5+cnk6aBtFnZjZaDqB0HBQ2n1NCZWWvwCPTX+Pf71sw0/E9mFbPr+p4h25o+RM2D32/hb7vbM+1nx7XCx7HhPyIPbD5j5oRgZZU/tHT65DqBU3LMDomRFiloirBMxaYn4HDn3PPCHUbOMCDPvsx+IOv+mMupM5u7/rSdQkA64QDOjgkosFDDnFoswY2asVOLxr1aBKn2wxdvaVrIffWWTuCMukx7IGgCoT4RaRDtHd2CrE9cWQsomnRnRUQkc/y50p0VUR1VkjKV7hFsYyRV9/kpuDYxvZsDK/lxC+Nf4baA3SbltmkxASaBlywtJgAmvq8wxeKpDjICUlRUQUyLyIQ+altEqkhhuI+A2hZncx/MtmC5ZLYFIK7uKvbYFD7gHhg0g/I5RqKOjI09/Ab4EYziuuwQd2nWZmYQ1/S88N9vmP9dmO48UkUXUzBdzO64wMro1pxJ10ONMkuesHiXsWrKkALeSn/LMV533OHCFIspChJfiQZ/uVZGao1Hl1HSVswSxOmVlIsR5FHKLZg0D0htG5TN7lKdgHpYTqThvQ2LKMtNm5yvE8eYa9xSZpJTZBmCq0GmKgRnA73xSlxfeOKeXxbnQT/Pr1MRd9fzaFbM3bXD4A5PI4gD9P2cfvQfaoAjmMhiNU9ibonzT+HrUrraJWb2kKL5Ml7NY3DH55Vl9CF5uyNtt8L3QIJSzn68xMplwWQ0hxqvonCuVObQNWkRVVLTlsMkF3Ihop73mtm9ruvzQN/niPoguxN4a+ujnK+m5k3omoQCx/oEEy7+QNwsL8OpLzHvZSZ38z7SjnPyhfrg3zJTmMp/qjf0yiyWScxj5WLxWDB9KlTMXEGEvYSS0Pk+ThlNdb6ej1NGBs2f6uNE3kjHx8mSFhCKIAJuF5YNufk0CDwkcPvPmTPeaMMBCUBX68KF/I+hmtOhNVn+Pr4uZsnXGUrKn6/jbvgr8/Xf5fySE3L9aOoXhesD5lyJVSRAnFQMjfpcT+H6BxuWs9W6WXwZ4nkvkytI4G6epn4CjkQ29VMLSGNTd8cf5C4Uxe2a+6zMHeEx5xj8fgsplzHJyWiHzKQinMlOWIGvABG4+aMFrO1s+UnqYeSXFi0qChlEUQMbVDHnWxKIts6wtFhhE+H9dFsCG/aptgT6Rkq2BL5E0Ngn14xhdzIwXYxQC1y+IpgKGz1cyFQPZwvgX8FXJarT6cyB8/EOQ+Rs6Ix7C/e4MnIAvwjsk9Rci9NBetihEOwIpOUHjoQdNKbqethJffnhJ1lJx42fgi0rAI5dQewS7nyDh9CYlLFSgi8tNLqW63toDGcMRcEmL/Hemj8seuDHc+MmcuLPAo+/W4qKo/k6GDPKX1REXK5cC03GjArmf+dRm1sQqQrDYdUiT1/x5IHpRMubPxHpRpArApcpn+Jw//fqDF6TT4GufQpReooPjRCGSEg4EwiUmZorJspWjOFv4pToI6OPR9EnMHikU7bow32hACK2acwRRSFa2p/04RlcTroJY+43+ceoienNLIgyhvqPM4WXuvb/nKNDKKJ8SzIz7u7qQu236hBcAUqTJnCyRq5va+yX29+bR91z5nNnTEDNHNaQvvVfr0YhiRAPWHxi2IPS7867N2w1OGXu3O/BDXO7BNTD1J4fdKazuuV678uFsaas7kODMhTn+5Hts/0Vz2or/tsYG9/aqOppNXndHxuWej+0u+99p69Qlj56sdSH/rTz0HCemyqj1zhPU+rgWmfa+ejXeuxAVkfVgVarRvcG8J1nqz0y7+uiWqtKxv0d1a3dfj03/+rgOa+ptKlH2N9bm9HrAzBR1EFnbLs90F9vLC86TfU7/n7v3l503u9WDfbvsHNv273Jy8BsqaBfOLbBCoxrrVnRuL6WcJxaTR2YSh08o638vpqgj837jbfe2KYgPPofum3Yutd+1+0Ww78ZzCv4fhuOdaiOuLHKDodPq+rA/w628W3Ubtf9e9l7fNe/e/cy7
HvUUV5X7dGdrVq3351xZ935eIEwsR7XCd8fVen2+nW5+3595/3nmux233m7DeaTMb776r6/LfpK7P7D7bB/Pxh0AAxbreibPaq9opZ6k6P0keFqNWr1VPOWAFa0tn6da6OBC/4tNKW6aq+rrtZ6XehKm35U6uD917W28lb6iuOfWv5z4J+xeGp9ceq9Frb5Ctr0QJtU2KYGn5tHz72s2144jqUe9OnqigH7YB6VF1dbq7Dv5aPSEMC3j7vvSxfAnTbGnqMy9alqSYCO/1o61265A1VZeu2PhqPeA+h8UWtAbUttpIJW6qC3NtNT6qC3r/VTE4ymxvHa2h6BI/2ktOfh0d2cN4PRQIg8juor7UOLrs03z1iw/Tp45lbbjlTlH0cDSn/1VuB5T4f3WwPQf5UBkAb9v3K6tTkuIRT15uY86KvpecbmWS98xvNA2wutpS633+FF36EFfbwA6Kl+XwAD+DdF55tv84L+3jXQnsoDbMaObVb3jy9+PwBLDByXf/4BnzcYgFVwrFPBGOF9Krj/Ho3ra661qmA8Gt9SXkBfXxTAKg2eBVRzC2G/Avfc2JHR15AKOQq8627g55/ba3BkdUg9AK968B3w/jy8nxyXrx5GG3EOo1Kd+wY47wEJ22ZsymxRFhj7dratvW+D7UyeoYCLWHwqig+XwEYm+etSc9dxCRZtSumWgXSLSapXwO0XBtMB30HJ6pge9h/+Ts3aYKkq/rgA/YP5BuajrrxwKuDEuqIBHlXntdYL4KC3vDYC9N0C81jRAH/5Wuq+ZITS0wA87ZXWwPjMFpCUFnz3K3oXSlYoSVnwLbTeUgHc+qv2e2Paeeept3d+2n/QZHXyBqTV3xaAwajzKi/egDQFfU6hFGo31Tj3rkEZpVM9tjoI5CA96jHLbwOMTl2/WU/37RUcSY/pjA3mjQI9ewj/P9hC32q3BhToM97CEpUg3jcB2gdkTKBVqK5e17YSrOYBPldnQr4NfofS9kuD/A/A64UBvHX7/KgaSbrVo9L2XtabZ4CW4v+jgJT1JXT8WbWuwXuo5GRjknN1luSEMj34KkAR+rsvSdZPCuTQFML5KZTzz4F05WPSBz+i3NiXPnGp6CFS0dtIxZ5iBFJuFUg9XXmF51DqQolIAR1lobcGzGMrPCp1cNTo7XkgQR/BzNg8q6DH8B0FQNqX8HfuY0uF0hdgBlK7wUK9SVtxax2MBeoy2noAob5+hPqOP2sg5OH5Wwz66qjx1bmHOhycDd2PBg+uLZ5is/054EAAM+n5QoTdWFJG5GhS4pBIkCpiWlIFX4HFpEoai0AbB2nCRaDTkHK8MlzSkLfI3VAUR+umevVmmffs0nBYU1zGpeE4Ugn9lGhPvIT4NgsF9GkLBVcmWi5pPcOC0SyDxpaIqJKcmGaxpoSEq5Qnh9DQaKoVRWcQEMNJ+AzIdL+zC9bp03FWH58VSVk5f/Wd0i6TvPh6dxgcA/3VnxZA4A2shG3CYtgEZ3Xg3/YLRN4Q0t48az6M3d+mS9z7eXLb5InT/dt5VUhH/NupboWIhVNJhOq/Gfu3I35cBuwk0/GSJrNxBVtMFxiqwu7daRFX05JH+TCo9EyYzHYt8cbja+cFyh1z93JZgFGz6/qZwRCe/p0H056GDI6h2AoFmXCj/tx4AmOn6vqb2njStbr+C7ddyjSfDE6E3BkvHjKwlcvKP11VIwGyMq6qwades61MtbmquENTbViKkyuisI35QormnJZ2szd2zO+GRvxqacs/UoRCEeTf/vIZu/G2yQpjRMbHfAjvxNOtfcvG9i0cf5/f4IkdoVv5fQIz08IaLJ04Eie9XWd5nOX+226M1LdnhxH8HItQwiXJanhraNmitJll6ksT1wsJP40YjruquGKJ6L3ZgRnaoCd7cCU0Jv1oEiT6Bhttfpeqz5fHl0E0cwbDKSjDmXzaFgxtvWq+3NZgODNfLnXDAs2XY1nSpjLZ5svxeL5c07G/IZ7GGL6AEmbOP
dP0M1cX7hyMAfoJbvz98Vow+mJTmCxwMWyCkqPLV8x/LYjDFo5gMOv2LXOHyjapVdE9JbY/kme68zQJDY1jZlnSPqYZa0+p79NSyDX5aH4VRegxHACPcENmQxen/7MsSj9XS/7n9vSUqhSLAFOq/MkJPqmWdw3bgGK3jhAJoY+LzIRDDWdsMQik5YWCWgzpJ5EWhY+iuaEsx8Lc0DNJDhPYWUctCfhiQJn/uVe/4tndYuksK+WcQCgkWB8o4+jLLLEyS6zMEiuzxMossYPSTcCkW85ZYgJphTED30ExY8SKogJDOyny6kSkwl9kdaGtCWhraWvBeDZiWUVjH5sQJKmCMApOlgipP9kqwgnW10pFuEwoLRNKy4TSMqG0kAmljIzLFYICmmVCqUBK6itCiNveEO8wWC3a/WMT8g00iJlfcdLfrNkcT51Zd2bZEAF+GammZ3360eC9ERRl8MPAEwOzDGA7kYrRADZSbdyMl2AFfK1/u+MCOSFrj+b0jwR+E+sZZxv4LaS+bF6s3byKYtwyolQRYmHWiGWKbRV8/pK5xKZXMV9Ehy1EfSUfH/ZOOkvt0bTKg9Bz3P9TKFgBbk6kKgKy4i1wQCOLTYWzUxfQnAgh46V0EV9KrzmT/sJXdPxthWwfhFAkHormvymrcZ+q/hQg01jE17ijjRtK/F6IX64ACY0iYSP2fztYK6P8jH1byrHy2SVRDje8pbasBAe+AEUoa1EWs0BYUKbFLLjkKWLpsaAyRSwPFgS0332c4rJssUMNZ627CiVp5UBa0R6XEdqZS2LlsdaShstfjYgK78dGS5Psk63+f0ACQ+MtuGJOvq2ZM/Hdkr/OV51pdRKuALvsiSRfdemhFoTjmMrYQy3m6KH+/XkGaB0R7vxkLdT9hjeVsvSJwgiJFYd22Df1n3swZ7tnbVTTF3oCT5idn5+fjGGkx7E5MToPh0vtcHAaUvLQmVlr8AiUgv79fiwrFEg8X97a1vQhah78fgt/20h0EjZ8NHxp8xkzJ2SWrPKHllINCkOy9GiWsDEWTfEE/iSiyfdXY1AS7vV93qs4+JT3sOiVpPfjSI/ZIT1ixRgy6WE6/fVID3c5P5oDE+q4J1LX8aDSUP2+gKL6XXfon2y732fFyVez4i7QU0nL8yIl4xhOLdpUwtOuOFz5scFUcvehIIZxLHw0BnGWQ6Y4nRqYJU6oiLsRWDTPi6Si/oTZxPFpwRr3wRK8bb8A1iKTP6zxQHiC++nnwXoX0DKXP6DxcHXx1wEaiN38AY17vghV6X86oNkCUDTuuJF/PKBFfpdHM0IBAI37XaKiYz8Y0hy/q+MxPCn2O0s4y7j5SBM8XD8MzhjrkITcKVrGrSWaENPxwyHNUvnrdzJut9C/wnDZhTRNKHuWLZxxm4X++UYLKg1ZLn/9TsYtFvrnmywcmpHAivlDGjdZ6N9ns7BCAbg0brPQv89oYYugeeBGC/3zrRYU0lwRNA/camF+vtWCQZqWcof0ZsU/Dmr694E6W59HY7Tiu4Il6C90Z+p1FkOv+v4nQerS7krLqbWtBf/vhDUYwjrRkdWdPYhIisMDCEMsep6glMs0IYSCRWuoXw1fhFmQaSilJNzE41ZgidGj2+KcUVZofzYmEt1yPpIvTrOUmIq0d1c4GtYCkCQevX0k9OWMeBUimRDcEdmyzzRmI8I+5UwFFRHMpNKVQfwmBMkOuI/FpKJhpa/TfrADkJ/Jt93XB4kh3ESLBh3uiRbdcgD6OLqPBhHWq8ItKYjwTqrVa7U0GTK6jC5nmjdBJIEE9d0ShN/0JB66j/HwG8kw0wy/oblU0SWh9VAEHp+xMgFf1ygkQUQX7ms5B1155Ulmji45Z3ThDptoX44TkZZPZbvUUCWIPDazCLqpQFBN00JVUfMzas4EIgA89dDSHm/CYkH+Jni/LdWCT5U7ILKXFjPNmybSHO776iwW8K2aCtqrKSmwictjFbNjEgLKz0WWEFiaKZdIkHGRJ
FpZlHukjbA+P03hN6lLopiz/KVx/9BmYyz3v4tu/wZutjXuwc20yqm2izoCdySl26Q21Wg8+uOcuZZXQn7Wc02i855ruNOmGU6xMOkD/HryJv7GdQ78r2/a5qA797enm5ozl5BU+I/PQonGZyEnbp7KZh7iXqIXqLTe+HsPvlbhbsxB4aaSi+IGaO5cFHfwNOPOOL8i6WRrXfi1uOYYHrNI7s3PL4eWn6RpQs2s9JJ7yYhL4Or5+TsvnLkcslve75BvO743MnmpgSLTRla7mYlIagQjIhSVvMxFlG65oeL0tjIjk+x+dxelIFUkmit3bo7PUejySvXcrPr+mFTPFDglkupJWsAgZnqy6Qk43HH3vHCHkaMMyLOvaP/jP+Zy6sz83ZRn5hQC0gkHcHa8QIGFGubwImWEZ+zwonGPF0Gq/fCFXZrONKyODOgfVRokPS0gZ+nOiohI5tBaC2fXEKHpKN8rI+nO4L60us9PwbWJX8h+7yp/3ML4V7gtzeIsYA+3TYsJMAk8aClWsPcVplis1UFGQIqYKqppEdnUR22LQDfJj/sIqG1xNvfBbAsWLTB7vcr5yERiwuJFievmo2/cXFo1nzy5cHdnbWYGMU9YwVlfFV1MwXQxu+MCK6NbcyZd7zXKLHnCwl7GqimTYB/n0t9y3A477nBhcmWKgsRXorFerpWRWuPTKy1KBug/sUdrijYCk5By85XmAantq4pMC6iH5UQa3tswtuFO2uR8nRjHXGOaMpOcIssQXA0yRVohpDdeiesLT9zzm3H5gHT8OliFqGyzgMmwxh2eRhAj6Ps5/cjAn1OqN5XVPMTsIUX6Zbyax+COzyvL6EPydkfaboXvgeSlovnxEiuX+cpotGYwLZ5dMxgtQICXaU1bDpNcyIWIiN5rZve6rs8DfZ8jeU+YrY9yvpqaN6FrEgoc6xNMuPgDcbO8DLW+xLyXM61qTw4HSJAp/BN280zRDErMY+VceSyYPhUqZq4gwl46f9sZ1Mcpo2nQae0OSkeZ2sl9nMgb6fg4WdICQhFEgL9pCST+zSbKKLf/nDnjjTYckAB0tS5cyP+2u3P/Nr4uZsnXGUrKn6/jbvgr8/Vf7vySE3J9dg9xZOb9oiuxagWIkwrfuv0Urn+wYTlbrZvFlyGe9zK5ggTu5mnqJ+BIZFM/tYA0NnV3/EHuQlHcrrnPytwRHnOOwe+3kHKJk5yMdshMKsKZ7IQV+AoQgZs/Gi3InfHiEJt6GPm+gkY/jQyiqIENqphLNrDcbYqlxQqbCO+n2xLYsE+1JdA3UrIl8CWCxj659pu2AM9CD89210AyevFViWiLd/YOQ+Rs6Ix7C/e4MnIAvwl3Nt1HB+lhh0KwQ9osZ5NgtIMdNKbqethJffnhR1tJx42fnJf+4Qa2u0TFnW/wEBqTMlZK8KWFRtdyfQ+N4YyhKNjkJfrbEYLjc+OG2exR6zv83VJSHE3XwXhR/pIiYnLlUuiZvChf9zuPmtyCSFUYDiskefqCJw8sJ1re/IlIN4JcEbhM2RSHu79XZ/CafGp3Hd4jMA29h0YIQyTkmwkEykzNExMlK8bwN3FK9JHRx6PoExg80Clb9OGuUAAR2zTmiJ4QrexP/H3s4WrSTRhyv0k/Ri1Mb2bN/UIk/3Gm8FLXPmvbWUWUb0lWxt1dXaj9Vh2CK0DV0gQ+1sjzbY27UBTvTaPuOfO5MyagZg7LS9/6r1ejiESIByw8MexB6Xfn3Ru2Gpwyd+734Ia5XQLqYWrPDzrTWd1yvfflwlhTVvehQRmK8/3I9tn+ime1Ff9tjI1vbVT1tJq87o8NS70f2t33vtNXKEsfvVjqQ3/aeWg4z02V0Wucpyl1cK0z7Xz0az12IKuj6kCrVaN7A/jOs9Uemfd1Ua1VJeP+jurWbr+em3918JzXVNrUI+zvrc3o9QGYKOqgM7bdHuivN5YXnab6HX+/d28vOu93qwb7d9i5t+3e5GVgtlTQLxzbYAXGtdasa
FxfSzhOraYOTKUOntFWfl9N0Mfm/cZbb2xTEB79D902bN1rv+t2i+HfDOYVfL8NxzpUR9xYZYfDp1V14H8H2/g2arfr/r3sPb7r3717GfY96iivq/bozlat2+/OuLPufLxAmFiP64Tvj6p0e/263H2/vvP+c012u++83QaTxxjffXXf3xZ9JXb/4XbYvx8MOgCGrVb0zR7VXlFLvclR+shwtRq1eqp5SwArWlu/zrXRwAX/FppSXbXXVVdrvS50pU0/KnXw/utaW3krfcXxTy3/OfDPWDy1vjj1XgvbfAVteqBNKmxTg8/No+de1m0vHMdSD/p0dcWAfTCPyourrVXY9/JRaQjg28fd96UL4E4bY89RmfpUtSRAx38tnWu33IGqLL32R8NR7wF0vqg1oLalNlJBK3XQW5vpKXXQ29f6qQlGU+N4bW2PwJF+Utrz8OhuzpvBaCBEHkf1lfahRdfmm2cs2H4dPHOrbUeq8o+jAaW/eivwvKfD+60B6L/KAEiD/l853doclxCKenNzHvTV9Dxj86wXPuN5oO2F1lKX2+/wou/Qgj5eAPRUvy+AAfybovPNt3lBf+8aaE/lATZjxzar+8cXvx+AJQaOyz//gM8bDMAqONapYIzwPhXcf4/G9TXXWlUwHo1vKS+gry8KYJUGzwKquYWwX4F7buzI6GtIhRwF3nU38PPP7TU4sjqkHoBXPfgOeH8e3k+Oy1cPo404h1Gpzn0DnPeAhG0zNmW2KAuMfTvb1t63wXYmz1DARSw+FcWHS2Ajk9x1qXnruARrNqV0y0C6xSTVK+D2C4PpgO+gZHVMD/sPf6dmbbBUFX9cgP7BfAPzUVdeOBVwYl3RAI+q81rrBXDQW14bAfpugXmsaIC/fC11XzJC6WkAnvZKa2B8ZgtISgu++xW9CyUrlKQs+BZab6kAbv1V+70x7bzz1Ns7P+0/aLI6eQPS6m8LwGDUeZUXb0Cagj6nUAq1m2qce9egjNKpHlsdBHKQHvWY5bcBRqeu36yn+/YKjqTHdMYG80aBnj2E/x9soW+1WwMK9BlvYYlKEO+bAO0DMibQKlRXr2tbCVbzAJ+rMyHfBr9DafulQf4H4PXCAN66fX5UjSTd6lFpey/rzTNAS/H/UUDK+hI6/qxa1+A9VHKyMcm5OktyQpkefBWgCP3dlyTrJwVyaArh/BTK+edAuvIx6YMfUW7sS5+4VPQQqehtpGJPMQIptwqknq68wnModaFEpICOstBbA+axFR6VOjhq9PY8kKCPYGZsnlXQY/iOAiDtS/g797GlQukLMAOp3WCh3qStuLUOxgJ1GW09gFBfP0J9x581EPLw/C0GfXXU+OrcQx0OzobuR4MH1xZPsdn+HHAggJn0fCHCbigpI3I0KW9IJEgVMS2pgi/AYlIljTWgjYM04RrQzWX+0+OF4ZJGvAX+h9wcrZvC1ptV3rMrw2FNcRlXhuNI1fVToj3xEuLbLBTQpy0UpE20XNJyhvnSLIOGloiokpyYZrGmhISLlCdH0NBophVFZxAPwxH2E810K7QLlulPc1anOCuSsnL+0k3ULpO8+HK3b6bdEFLYPGsOaxfa4f1t6sPAz3nbJkJs1sJts+v62XAzE9bZRp3dFYzMClw0HfFrp7o7IhZFJRGK/mbs1474cBmnc6ZulzSHjct3EV1gqAq7d/NFXD1LHtzDoFIzYQ7btcQaX+iUMZewFIjyUQvC07/zYNrTkMExFFuhdpisOfm2Zs7Ez6ko08guYbxwIuTOePFQgS2qlX+6mEYCZGVcTINPvVRbmWFzVXGHZtiwFCdXRGEb64XUyjkt22ZvzJjfDY3409KWf6TIhCLIv/1VM6Iw24dFLzo7Wg8jslPmQ3gnnmUNfgAm5Yfy+lv/Bk+gJslvE5iZ1tNg6cQROOltRMvjLLd0X6S6YzsM3OdYhBIuyVHDW0OrFaXNLPElCc2cweVlynAmn7YFQ/2umj60VaTOTB9KXeFC04dYl
rTHRrbpQzzuvm869jfE0xjDFxBO5twzTT+Rb+HOwRig/XTjbxfWgqvRmzpNgem1CdKMLl8zHdCQzN4nQTx0Tekzzc1i8ADNwazbt8wdKqvTgJ7E2D0ltl2MZ7rzNAkNjetkWdKWjxlLlZzy1Iq2Rsnnm2/GcAA8wg2ZDV2cDc2yKP1cLRea29NTqis5vFyqQhcTPJeezkSxWwNRQujjIvXpUMMZa1JRd2VSXZH8HmiuHMuxMFfuTJLDBHbWURwC7tsv8+H26lc8u1s7mmWlnBOqBNzJjSGvjCsus2bKrJkya6bMmimzZg5KNwGTbjlnzQjlBk4FVIGhnRQZyBGp8BdZXWhrAtpa2lowvsJXVhXYxyYESaogjIKTJUIqRLaKcJlgVwxFuEywKxPsygS7MsHu9FVSRsblCkEBzTLBTiAl2BUh9Gdv6GsYxBNthrAJhQUaxMwvwOfvXWuOp86sO7NsiAC/rE7Tsz79KNneCIoy+GHgiYFZBvZcGNhDKhWa8RKsgK/1bwvQkxJM9mpO/0hALLG8a7YBsUIeu8sVZ3Oj/NbJRakixMJPEcsU2zn1/CVziU2vgLiIDluI+ko+PuyddJbaBdKedBkRepG2QwxUjTzjuSsCsuItcEAji02Fs0O60VhxIeOl9Kj/nbKDk/7CV3T8XVZsH4RQJB6Kcr4pqxOfqv4UIANTxNe4ozr2JX4vxC9XgEQvkSAwymCtDOLW9+2wxcpnl4g43PCW2rISHPgCVCpp/mWO//kcKHnmTHociCs5UA4cCCi/+xjFZUk0hxrOWnUtk7LyIK1ox78I7cwlofJYa0mj5a9GRIV3Y6Mi8dOa+DYZqp77/5XlGq4tQwuw25hIclKXrmlBOI6pjF3TYo6u6d+fYIAWVuDOz9JC/W54U2nLHdy3u6faDvUff1Ows3bs6As9gSfMzs/PT8ZIMSGUE6PzcLjUDgenISUPnZm1Bo9A+eff78fSQYGs8yWtbU0foubB77fwt42EJWHDR+OWNp8xc0JmySp/aCnVaDAkPY9mCTsE0RRP4E+bvYSuzqCisFVElCyMUInAKe9h0StJ78eRHrNDesQSGmTSw7T565Ee7mt+NAcm1G5PpK7j0aSh4n0BRfW77tA/2Xa/z36Tj9pvGeippHV5kZJxDKcWZioxGH4Jvh8bTCV3HwpiGMfiRmMQZzlkitOpgVnihIq4G3pF87xIqm5OmE0cnxascecrwRnyC2AtMvnDGo+AF34DrHcBLXP5AxqPUxd/HaCB2M0f0LjPi1Ce+6cDmi0AReOOG0L1jB8GaJHf5dGMUABA436XqBLwD4Y0x+/qeAxPCvrOFM6414KmM4PzQa2cTtWe4pkKg5A9J+VO9jJuzdMEh+MPI3uMk0tC/pDGjVea/XWQZqn81W0ZNyPpX2FH7kKaJpSfyxbOuAlJ/3wbElVOWC5/dVvGDUj651uQHJoZwor5Qxq3IOnfZ0KyQgG4NG5C0r/PhmSLoHngNiT9841IFNJcETQP3Ihkfr4RiUGaLoDdgpuRTHZmZGaQTtUDBU5nDowY29y7B4Mdak7fhE/8Pw== \ No newline at end of file diff --git a/doc/source/training/helpcenter_training/workflow.rst b/doc/source/training/helpcenter_training/workflow.rst new file mode 100644 index 0000000..29598de --- /dev/null +++ b/doc/source/training/helpcenter_training/workflow.rst @@ -0,0 +1,45 @@ +.. 
_documentation_change_process: + +Documentation Change Process +============================ + +The following figure provides an overview about the current helpcenter 3.0 process. + + +.. image:: training_images/helpcenter_3.0_process.drawio.png + :target: training_images/helpcenter_3.0_process.drawio.png + :alt: helpcenter_3.0_process + + +Document Change Process Description +----------------------------------- + +**Huawei Documentation System** + +1) Huawei employee / Squad member receives Jira task to deliver new documentation from Huawei documentation system of specific service. +2) The documentation files are exported in HTML format and needs to be converted to RST sources which will be used by Sphinx rendering engine in a later step. + +**PreProduction Environment (Gitea)** + +3) Huawei employee / Squad member creates a fork from ``doc-exports`` repository. The new documentation needs to be pushed into the forked ``doc-exports`` repository. A detailed description can be found under the following link: `"How to fork a repository in Gitea and push changes." `_. +4) After the forked ``doc-exports`` repository has been updated, a Pull Request to the original / upstream ``doc-exports`` repository needs to be opened. +5) The automation pipeline tool Zuul CI/CD converts the delivered HTML files to reStructuredText (RST) sources with a self-written script. +6) These RST source files will be stored in single repositories located in Gitea, too. For this purpose, Zuul opens a Pull Request in the specific service repository, e.g. ``UMN`` (user manual) documentation for service ``CSS``. In the open Pull Request users are able to see differences between the old state of the documentation and the new one. Gitea provides suitable tools to comment specific lines of code, to approve a Pull Request or request changes. +7) Additionally, another pipeline in Zuul builds a preview documentation from the RST files included in the Pull Request which can be used in the review process. 
`Click on this reference to see how to open Pre-rendered documentation after opening a Pull Request. `_ +8) Assigned squad member(s) start the review process of the auto-created Pull Request and communicate their feedback. This can be done via the included functionality of Gitea to provide comments to a specific line of code. +9) If the Pull Request fulfills all expectations, the Pull Request will be approved by the reviewer via ``Approve`` selection and the steps 10 and 11 can be skipped. The Pull Request is marked as ``approved``. +10) If the Pull Request does not fulfill all expectations, review comments need to be added to the specific lines of code in the target repository of the specific service. After all comments are placed, the review of the Pull Request must be finished with ``Request changes`` selection instead of ``Approve``. +11) Huawei collects the comments and prepares a new documentation change or replies to the comments. They update the existing Pull Request with a new commit on the docs/doc-exports repository. +12) After the review has been approved, the Squad's product owner or a privileged squad member can add the ``gate`` label to the Pull Request. +13) The ``gate`` label triggers another CI/CD pipeline of Zuul to render the final PreProd documentation on https://docs-int.otc-service.com/. + +**Production Environment (GitHub)** + +14) Additionally, Zuul opens a Pull Request in GitHub in the specific service repository to update the Production documentation, too. +15) If there are merge conflicts, these need to be solved (16). If no conflicts occur, skip to 17. +16) The reviewing squad member must decide how to fix the conflict and what content is supposed to be changed. A discussion is necessary with the Product Owner if further judgement is needed. +17) If no merge conflicts are present, Zuul renders a preview documentation and stores it in Swift object storage. +18) A member conducts a formal review of the documentation changes. 
+19) The Pull Request will be approved by the member. +20) After the review has been approved, the Squad's product owner or a privileged squad member can add the ``gate`` label to the Pull Request. +21) Zuul builds and releases the final documentation on the production environment: https://docs.otc.t-systems.com/ diff --git a/doc/source/training/index.rst b/doc/source/training/index.rst new file mode 100644 index 0000000..1fc19ec --- /dev/null +++ b/doc/source/training/index.rst @@ -0,0 +1,9 @@ +Trainings +========= + +.. toctree:: + :maxdepth: 1 + + helpcenter_training/index + apimon_training/index + sd2_training/index diff --git a/doc/source/training/sd2_training/contact.rst b/doc/source/training/sd2_training/contact.rst new file mode 100644 index 0000000..8552d03 --- /dev/null +++ b/doc/source/training/sd2_training/contact.rst @@ -0,0 +1,21 @@ +Contact - Whom to address for Feedback? +======================================= + +In case you have any feedback or proposals, or have found any issues regarding the +Status Dashboard, EpMon or CloudMon, you can address them in the corresponding GitHub +OpenTelekomCloud-Infra repositories or StackMon repositories. + +Issues or feedback regarding the **ApiMon, EpMon, Status Dashboard, Metric +processor** as well as new feature requests can be addressed by filing an issue +on the **GitHub** repository under +https://github.com/opentelekomcloud-infra/stackmon-config + +If you have found any problems which affect the **internal dashboard design** +please open an issue/PR on **GitHub** +https://github.com/stackmon/apimon-tests + +If there is another general issue/demand/request, try to locate the proper repository in +https://github.com/orgs/stackmon/repositories + +For general questions you can write an E-Mail to the `Ecosystems Squad +`_. 
\ No newline at end of file diff --git a/doc/source/training/sd2_training/dashboards.rst b/doc/source/training/sd2_training/dashboards.rst new file mode 100644 index 0000000..0ef681d --- /dev/null +++ b/doc/source/training/sd2_training/dashboards.rst @@ -0,0 +1,100 @@ +==================== +Dashboard Management +==================== + +As explained in previous pages, the resulting metrics of the configured +monitor plugins (mainly of EpMon, but possibly also from other plugins) +are first stored in a Graphite time series database, before they are +further processed as flags and semaphores for the actual public dashboard. + +However, sometimes Service Engineers or Service Managers benefit from +deeper inspection of this time series data for debugging purposes. +Therefore a Grafana frontend may be used to visualize and drill down +the data. The entrypoint to a set of predefined dashboards is: + + https://dashboard.tsi-dev.otc-service.com/dashboards/f/CloudMon/cloudmon + +The authentication to this dashboard is only available for OTC staff members. +It is managed by Keycloak which in turn utilizes the OTC LDAP directory. + +The Dashboards are grouped by the type of service: + + - The **Squad Flag and Health** dashboard provides a high level overview + of the service health and flag metric status for each service of a + squad, respectively. + - The **Cloud Service Statistics** dashboard monitors the health of each + endpoint url listed by an EpMon configuration entry. + - Dashboards can be replicated and customized for individual squad needs. + +The Cloud Service Statistics dashboards honor the ``Environment`` (target +monitored platform) and ``Zone`` (monitoring source location) variables +at the top of each dashboard so these views can be adjusted based on +the chosen value. + +All the Squad Flag And Health dashboards support Environment (target +monitored platform) variables at the top of each dashboard. 
+ + +Squad Flag and Health Dashboard +=============================== + +The dashboard provides deeper insight in Metric Processor generated metrics. +Flag panels provide information whether service has exceeded a threshold +of a predefined flag metric type. Health panels provide information about +resulting service health status based on evaluated flag metrics. + +The resulting flag values are visualized in state timeline panels with the +following values: + +- 0 - flag metric is not breaching the defined threshold. +- 1 - flag metric is breaching the defined threshold. + +The resulting health values are visualized and mapped in state timeline +panels with the following values: + +- 0 - Service operates normally. +- 1 - Service has a minor issue resulting from defined reached flag metric(s). +- 2 - Service has an outage resulting from defined reached flag metrics(s). + +Example at https://dashboard.tsi-dev.otc-service.com/d/s75qyOU4z/compute-flags?orgId=1 + +.. image:: training_images/flag_and_health_dashboard.png + + +Cloud Service Statistics dashboard +================================== + +The Cloud Service Statistics dashboards uses metrics from GET query +requests towards OTC platform (:ref:`EpMon Overview `) +and visualize it in: + + - API calls duration per each URL query. + - API calls duration (aggregated). + - API calls response codes. + +Example at https://dashboard.tsi-dev.otc-service.com/d/b4560ed6-95f0-45c0-904c-6ff9f8a491e8/sfs-service-statistics?orgId=1&refresh=10s + +.. image:: training_images/cloud_service_statistics.png + + +Custom Dashboards +================= + +The dashboards described above are predefined and read-only. 
Further +customization is currently possible via system-config in GitHub: + + https://github.com/stackmon/apimon-tests/tree/main/dashboards/grafana + +The predefined simplified dashboard panel in YAML syntax is defined in +the Stackmon Github repository: + + https://github.com/stackmon/apimon-tests/tree/main/dashboards + +Dashboards can be customized also just by copy/save function directly in +Grafana. The whole dashboard can be saved under new name and then edited +without any restrictions. + +This approach is valid for testing proofs of concept, temporary solutions, +and investigations but should not be used as permanent solution as +customized dashboards which are not properly stored on Github repositories +might be permanently deleted in case of full dashboard service re-installation. \ No newline at end of file diff --git a/doc/source/training/sd2_training/databases.rst b/doc/source/training/sd2_training/databases.rst new file mode 100644 index 0000000..42ea16e --- /dev/null +++ b/doc/source/training/sd2_training/databases.rst @@ -0,0 +1,160 @@ +.. _sd2_metric_databases: + +================ +Metric Databases +================ + +Metrics are stored in Graphite time series database in different databases: + + - cloudmon-metrics + - cloudmon + + +cloudmon database +================= + + +EpMon data are stored in the clustered Graphite TSDB. +Metrics emitted by the processes are gathered in the +row of statsd processes which aggregate metrics to 10s precision. 
+ + ++---------------------+-----------------------------------------------------------------------------------------------+ +| Parameter | Value | ++=====================+===============================================================================================+ +| Grafana Datasource | cloudmon | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database type | time series | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Main namespace | stats | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Metric type | OpenStack API metrics (including otcextensions) collecting response codes, latencies, methods | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database attributes | "timers", "counters", "environment name", "monitoring location", "service", "request method", | +| | "resource", "response code", "result", custom metrics, etc | ++---------------------+-----------------------------------------------------------------------------------------------+ +| result of API calls | attempted | +| | passed | +| | failed | ++---------------------+-----------------------------------------------------------------------------------------------+ + + +.. 
image:: training_images/graphite_query.png + + +All metrics are under "stats" namespace: + +Under "stats" there are following important metric types: + +- counters +- timers +- gauges + +Counters and timers have following subbranches: + +- openstack.api → pure API request metrics + +Every section has further following branches: + +- environment name (production_regA, production_regB, etc) + + - monitoring location (production_regA, awx) - specification of the environment from which the metric is gathered + + +openstack.api +------------- + +OpenStack metrics branch is structured as following: + +- service (normally service_type from the service catalog, but sometimes differs slightly) + + - request method (GET/POST/DELETE/PUT) + + - resource (service resource, i.e. server, keypair, volume, etc). Sub-resources are joined with "_" (i.e. cluster_nodes) + + - response code - received response code + + - count/upper/lower/mean/etc - timer specific metrics (available only under stats.timers.openstack.api.$environment.$zone.$service.$request_method.$resource.$status_code.{count,mean,upper,*}) + - count/rate - counter specific metrics (available only under stats.counters.openstack.api.$environment.$zone.$service.$request_method.$resource.$status_code.{count,mean,upper,*}) + + - attempted - counter for the attempted requests (only for counters) + - failed - counter of failed requests (not received response, connection problems, etc) (only for counters) + - passed - counter of requests receiving any response back (only for counters) + + +cloudmon-metrics database +========================= + + +Cloudmon data are stored in the clustered Graphite TSDB. +Metrics are emitted by the Metric Processor. 
+Metric Processor is processing the cloudmon metrics (from EpMon) and based on defined flag metrics (https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/flag_metrics.yaml) +and defined thresholds(https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/metric_templates.yaml) finally produces the health metrics +(https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/health_metrics.yaml) with different impact. +Final health metrics are then sent to Status Dashboard to visualize them as semaphore lights. + + + ++---------------------+-----------------------------------------------------------------------------------------------+ +| Parameter | Value | ++=====================+===============================================================================================+ +| Grafana Datasource | cloudmon-metrics | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database type | time series | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Main namespace | stats | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Metric type | Metric Processor produces flag metric values (0,1) and health metric values (0,1,2) | ++---------------------+-----------------------------------------------------------------------------------------------+ +| Database attributes | "health", "flag", "environment name", "service", "service type", "flag metric type" | ++---------------------+-----------------------------------------------------------------------------------------------+ +| result | 0 | +| | 1 | +| | 2 | ++---------------------+-----------------------------------------------------------------------------------------------+ + + +.. 
image:: training_images/mp_query.png + + +Based on the type of metric All metrics are under "stats" namespace: + +Under "cloudmon-metrics" there are following important metric types: + +- flag +- health + +- environment name (production_regA, production_regB, etc) + + +flag metrics +------------ + +flag metrics branch is structured as following: + +- environment name (production_regA, production_regB, etc) + + - service type (service type from the service catalog) + + - flag metric type (api_slow, api_down, api_success_rate_low, ...) + +flag metrics contain following values: + +- 0 - flag metric is not breaching the defined threshold +- 1 - flag metric is breaching the defined threshold + + +Health metrics +-------------- + +Health metrics branch is structured as following: + +- environment name (production_regA, production_regB, etc) + + - service (cloud service) + +Health metrics contain following values: + +- 0 - Service operates normally +- 1 - Service has a minor issue resulting from defined reached flag metric(s) +- 2 - Service has an outage resulting from defined reached flag metrics(s) \ No newline at end of file diff --git a/doc/source/training/sd2_training/epmon_checks.rst b/doc/source/training/sd2_training/epmon_checks.rst new file mode 100644 index 0000000..8d6abca --- /dev/null +++ b/doc/source/training/sd2_training/epmon_checks.rst @@ -0,0 +1,95 @@ +.. _sd2_epmon_overview: + +============================ +Endpoint Monitoring overview +============================ + + +EpMon is a standalone Python based process targeting every OTC service. It +looks up the services from the service catalog and sends GET requests to +the configured endpoints. + +While performing extensive tests like provisioning a server provides a great +coverage and deep insights, it is a rather expensive and complex activity. +It can only be performed every so often and leaves certain gaps on the +timescale of monitoring. 
To cover this gap, the EpMon plugin sends +GET-requests to a list of URLs endpoints discovered from the OTC service +catalog and augmented by simple paths like ``/server``. Such requests +are cheap and can be sent in a loop, i.e. every five seconds. +The latency and the HTTP status code of those calls are captured, +stored in a time series database, and further processed by the Metric +Processor. + +Currently EpMon configuration is located in the project ``stackmon-config``: + + https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/epmon/config.yaml + +It defines the query HTTP targets (urls) for every single OTC service. + +An entry in the OTC service catalog is a prerequisite to enable service +to be queried by EpMon: + + https://git.tsi-dev.otc-service.com/ecosystem/service_catalog + +If there are multiple entries in service catalog, such service entries +can be marked for skip in case they are obsolete. EpMon ``config.yaml`` +only defines the service queries but doesn't say how and when to use them. +For actual use across different monitoring sources and targets the +configuration matrix is defined in: + + https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/config.yaml + +The following example configures the autoscaling service (``as``) in +EpMon and adds four paths to the service endpoint (three URL path +for the Swisscloud): + +.. code:: yaml + + as: + service_type: as + sdk_proxy: auto_scaling + urls: + - / + - /scaling_group + - /scaling_configuration + - /scaling_policy + as_swiss: + service_type: as + sdk_proxy: auto_scaling + urls: + - / + - /scaling_group + - /scaling_configuration + as_skip_v1: + service_type: asv1 + urls: [] + +There are three separate items defined for the autoscaling service: + +- The ``as`` entry is the default. It is used for the public OTC regions. +- The ``as_swiss`` entry defines the specific settings for the Swisscloud. +- The ``as_skip_v1`` entry is entry to be skipped from EpMon. 
+ +By default all entries in the service catalog are triggered for EpMon. + +The mandatory parameter for all entries is ``service_type``. This has to +match the ``service_type`` entry in the OTC service catalog. + +Another important parameter is ``sdk_proxy``. This attribute identifies +which otcextension module should be used for the execution of HTTP GET-queries. + +The most important parameter is ``urls``. It defines a list of URLs which +EpMon triggers for this service. As the ``service_type`` is known, the full +URL does not need to be defined; only the path which appears after the +predefined URL from the OTC service catalog is required. + +If some specific service (or some specific service version) should be +skipped from endpoint monitoring, the value of the ``urls`` key has to +be set to the empty list in the EpMon configuration file. This ensures that +even default queries from the service catalog are overwritten by the empty +list in this configuration. In this example, the service type asv1 (entry from +the OTC service catalog) is not being triggered by EpMon at all as it +contains an empty ``urls`` list. + +Collected response codes and response times are sent to the Graphite time +series database for further processing by the Metric Processor. \ No newline at end of file diff --git a/doc/source/training/sd2_training/incidents.rst b/doc/source/training/sd2_training/incidents.rst new file mode 100644 index 0000000..a03bb8f --- /dev/null +++ b/doc/source/training/sd2_training/incidents.rst @@ -0,0 +1,68 @@ +.. _sd2_incidents: + +========= +Incidents +========= + +TODO +Incidents inform customers about the reason why some cloud service has changed its status from "green" (normal operation) to any other state. + +Incidents are created under the following conditions: + +- Metric Processor evaluates value 1 or 2 on the health metric of a specific cloud service and an incident is automatically created on SD. 
+- Service Incident Manager (SIM) manually creates an incident on SD for one or more cloud services. + +Each cloud service on SD is represented by its name and the status semaphore color icon representing its current health status. +The following states of the service can be shown on SD2: + +- Operational - green "check" mark icon +- Maintenance - blue "wrench" mark icon +- Minor Issue - yellow "cross" mark icon +- Major Issue - brown "cross" mark icon +- Service Outage - red "cross" mark icon + +These 5 states can be set manually for specific service(s) during incident creation but only 2 states (Minor issue and Service Outage) are set automatically by the Metric Processor health metrics. +Incidents are visualized in the respective color scheme on the top of the SD page. Also it's possible to navigate to the related incident by clicking on the service state icon next to the service. + +Once the service health status is changed and an incident is created there's no automated clean-up of the incident and the incident must be handled by the respective SIM. Only after the incident is closed does the service change its state back to the "green" Operational state. + +Incident manual creation process +================================ + +As mentioned, besides the automated incident creation the incidents can be created manually as well. +The Service incident manager must authenticate prior to being able to create an incident. +Login is ensured by the OpenID Connect feature on page https://status.cloudmon.eco.tsi-dev.otc-service.com/login/openid + +Once logged in, the new option "Open new incident" appears at the top right corner of the page. + +.. image:: training_images/sd2_incident.jpg + +The incident creation process consists of these mandatory fields: + +- Incident Summary - Description of the incident +- Incident Impact - Drop-down menu of 4 service states (Scheduled Maintenance, Minor Issue, Major Issue, Service Outage) +- Affected services - List of all OTC cloud services in conjunction with regions. 
One or more items can be chosen +- Start - Timestamp when the incident has started + +Incident update process +======================= + +During the incident lifecycle the SIM can update the incident with relevant information. +The incident update process consists of these optional fields: + +- Incident title - Change the title of the incident +- Update Message - Additional details related to the current status of the incident +- Update Status - Drop-down menu of 4 incident statuses (Analyzing incident, Fixing incident, Observing fix, Incident resolved) +- Next Update by - Timestamp when the incident is expected to be updated with further information + +Incident manual closure process +=============================== + +An incident is never closed automatically. The SIM needs to close the incident by changing its status during the update incident process to "Incident resolved". +After that the incident disappears from the active list of incidents and the service health status is changed back to the "green" operational state. +Every closed incident is recorded in the Incident History. + +Incident notifications +====================== + +Status Dashboard supports RSS feeds for incident notifications. The details of how to set up an RSS feed are described on the :ref:`notifications ` page. \ No newline at end of file diff --git a/doc/source/training/sd2_training/index.rst b/doc/source/training/sd2_training/index.rst new file mode 100644 index 0000000..26a5fc8 --- /dev/null +++ b/doc/source/training/sd2_training/index.rst @@ -0,0 +1,20 @@ +=========================== +Status Dashboard 2 Training +=========================== + +.. 
toctree:: + :maxdepth: 1 + + onepager + introduction + workflow + status_dashboard_frontend + monitoring_coverage + epmon_checks + dashboards + metrics + databases + incidents + notifications + recorded_session + contact diff --git a/doc/source/training/sd2_training/introduction.rst b/doc/source/training/sd2_training/introduction.rst new file mode 100644 index 0000000..2c1bef5 --- /dev/null +++ b/doc/source/training/sd2_training/introduction.rst @@ -0,0 +1,93 @@ +====================================== +Introduction to the Status Dashboard 2 +====================================== + +The Open Telekom Cloud is represented to users and customers by the API +endpoints and the various services behind them. Customers are +interested in a reliable way to check and verify if those services are actually +available to them via the Internet. + +The Status Dashboard 2 (SD2) is a service facility monitoring all OTC +services, intended for customers to grasp a quick overview of the service +availability. It comprises a set of **monitoring zones**, each +monitoring services of a **monitoring environment** (a. k. a. regions +like eu-de, eu-nl, etc.). The mapping of monitoring zones to monitoring +sites is configured in a mesh matrix to validate internal as well as +external connections to the cloud. + +Monitoring can be a tricky process, as there are many approaches of how +deep, realistic, practical, synthetic, and reliable to measure the systems +and services. The SD2 provides a reliable, quick, and comprehensive view +on the OTC, and makes some opinionated, deliberate simplifications. This +document guides through the architecture and necessary steps to maintain +the monitoring process by all OTC staff roles involved in providing a +service. + +Key features of the SD2 framework: + + - Developed to **supervise the 24/7 availability** of the public APIs + of the OTC platform. + - SD2 **sends GET-requests that list resources** to API-endpoints. 
It + does explicitly not simulate more complex, multi-stage use-cases. + - Answers to such requests (status, roundtrip time) are grouped by + **service** and considered as **metrics**. They are sent to the + **Metric Processor**. + - The Metric Processor maps the metrics to **flags**, that are raised + for certain situations, like request probes not being answered (API + down), a majority not answering within a defined threshold period + (API slow) or other situations. + - Based on a combination of raised flags and their severity, the Metric + Processor calculates health metrics as **semaphores**. No flags result in + a green semaphore, minor issues result in a yellow semaphore (service + degradation), while severe situations lead to red semaphores (service + unavailable). + - The **SD2 frontend** visualizes health of the service based on the + semaphores on a website. + - Each non-green semaphore raises automatically an **issue** and displays + it on the website. MODs and/or service squad owners should now take over. + - It requires the **manual intervention** of the affected service's owners + to review, document, resolve, and eventually delete the issue condition. + +.. image:: https://stackmon.github.io/assets/images/solution-diagram.svg + + +SD2 Architecture Summary +------------------------ + + - The **EpMon** plugin (end point monitoring) sends several HTTP query + requests to service endpoints and generates metrics. + - HTTP request metrics (status code, round trip time) are generated by + OpenStack SDK and are collected by Statsd. + - A time series database (Graphite) pulls metrics from Statsd. + - The Metric Processor (MP) processes the requests metrics and flags + certain circumstances. Based on defined rules and thresholds, the + MP computes resulting service health metrics (semaphores). + - The MP raises an issue for any non-green semaphore and stores it in + the SQL-based incident database that is part of the frontend component. 
+ - The Status Dashboard frontend visualizes the incidents on a website. + - Grafana dashboards visualize data from Graphite as well as from the + Metric Processor for OTC staff members. + - Service Levels are computed based on how long incidents last. + + +SD2 features +------------ + +SD2 comes with the following features: + +- Service health with 5 service statuses (three generated + semaphores, one custom semaphore light, one maintenance status). +- HTTP GET-requests for Endpoint Monitoring. +- Custom metrics and custom thresholds. +- Incidents are generated once non-green semaphores are detected. + Alternatively, incidents can be raised manually as maintence + downtimes. +- All OTC-environments including eu-de, eu-nl, and eu-sc2 are covered. +- The monitoring environments are decoupled from the monitoring zones + obtaining the metrics and include eu-de, eu-nl, eu-sc2, and GCP. +- Linked Grafana dashboards support service squad members and MODs to + understand the root cause for service health changes. +- Each service squad can control and manage their metrics as well as + dashboards individually. +- All parameters configured from single place (stackmon-config) in + human readable form (YAML). \ No newline at end of file diff --git a/doc/source/training/sd2_training/metrics.rst b/doc/source/training/sd2_training/metrics.rst new file mode 100644 index 0000000..45e633f --- /dev/null +++ b/doc/source/training/sd2_training/metrics.rst @@ -0,0 +1,164 @@ +.. _sd2_metrics_definition: + +======= +Metrics +======= + +Status Dashboard distinguish 2 types of metrics: + +- Metrics emitted by EpMon +- Metrics by Metrics Processor + + +- The EpMON plugin internally invokes method calls to **OpenStack SDK + libraries.** They in turn generate metrics about each API call they do. This + requires some special configuration in the clouds.yaml file (currently + exposing metrics into statsd and InfluxDB is supported). 
For details refer + to the `config + documentation `_ + of the OpenStack SDK. The following metrics are captured: + + - response HTTP code + - duration of API call + - name of API call + - method of API call + - service type + +- Based on EpMon metrics the Metric Processor **emits flag and health metrics**. The following + metrics are captured: + + - environment + - service + - service type + - flag metric type + - resulting value (0, 1, 2) + +Custom metrics: + +Besides default flag and health metrics some services might require specific approach +and evaluation of how to aggregate and combine the HTTP query metrics and +whether custom thresholds must be applied. +For such cases, the custom metrics might be introduced in Metric Processor configuration files: +https://github.com/opentelekomcloud-infra/stackmon-config/tree/main/mp-prod/conf.d + + +More details how to query metrics from databases are described on :ref:`Metric +databases ` page. + + +Configuration of Flag metrics +============================= + +Flag metrics are defined by 2 configuration files: + +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/flag_metrics.yaml + +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/metric_templates.yaml + +Example of Autoscaling service entry in flag_metric.yaml: + +.. code:: yaml + + ### AutoScaling + - name: "api_down" + service: "as" + template: + name: "api_down" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + + - name: "api_slow" + service: "as" + template: + name: "api_slow" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + + - name: "api_success_rate_low" + service: "as" + template: + name: "api_success_rate_low" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + + +For each service set of flag metrics are defined. These metrics are used by Metric Processor to define the health metric for respective service. 
+Flag metric is represented by its name, service attribute (relation to EpMon service definition), +template reference (which exact query with which threshold is defined for this metric) +and environments entry (list of environments where this metric is applicable). + +Example of template metric definition in metric_template.yaml: + +.. code:: yaml + + api_success_rate_low: + query: "asPercent(sumSeries(stats.counters.openstack.api.$environment.*.$service.*.*.{2*,3*,404}.count), sumSeries(stats.counters.openstack.api.$environment.*.$service.*.*.attempted.count))" + op: "lt" + threshold: 90 + api_down: + query: "asPercent(sumSeries(stats.counters.openstack.api.$environment.*.$service.*.*.failed.count), sumSeries(stats.counters.openstack.api.$environment.*.$service.*.*.attempted.count))" + op: "eq" + threshold: 100 + api_slow: + query: "consolidateBy(aggregate(stats.timers.openstack.api.$environment.*.$service.*.*.*.mean, 'average'), 'average')" + op: "gt" + threshold: 300 + + +Templates define how the flag metrics are evaluated. + +- "query" parameter defines query to graphite time-series database which stores collected metrics from EpMon. + For the details how the query is structured in Graphite TSDB, refer to :ref:`Metric databases ` page. + +- "op" parameter defines the operation for comparison with the threshold (lt - lower than, gt - greater than, eq - equal to, ...) + +- "threshold" parameter defines the value which is used to compare query with. + +For example: + +api_slow metric template defines query whether the average and consolidated aggregation of latencies of all GET queries +for specific service is greater than 300 milliseconds. If yes the value of the flag metric will be 1. +if no the value of the fla metric will be 0. + +Metric template configuration introduces pre-defined metric queries but in case some service needs different approach, +the custom metric can be introduced here as well. 
+ + +Configuration of Health metrics +=============================== + +Once the flag metrics are defined. Metric Processor evaluates health metric based on conditions defined in health_metrics.yaml. +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/health_metrics.yaml + + +Example of DEH health metric definition: + +.. code:: yaml + + ## Compute + ### DEH + deh: + service: deh + component_name: "Dedicated Host" + category: compute + metrics: + - deh.api_down + - deh.api_slow + - deh.api_success_rate_low + expressions: + - expression: "deh.api_slow || deh.api_success_rate_low" + weight: 1 + - expression: "deh.api_down" + weight: 2 + +Configuration consists of following attributes: + +- service - service name (relation to EpMon) +- component_name - component name (relation to SD catalog) +- category - service category (relation to SD catalog) +- metrics - which metrics apply for health metric evaluation (relation to flag metrics) +- expressions - definition of the resulting health metric value by the defined expression. 1 means minor issue. 2 means outage. + diff --git a/doc/source/training/sd2_training/monitoring_coverage.rst b/doc/source/training/sd2_training/monitoring_coverage.rst new file mode 100644 index 0000000..e86a963 --- /dev/null +++ b/doc/source/training/sd2_training/monitoring_coverage.rst @@ -0,0 +1,211 @@ +=================== +Monitoring coverage +=================== + +While monitoring the cloud services of the OTC (which we call +monitoring environments) is convenient and effective most of +the time, it is obvious that in corner cases the servers performing +the actual monitoring (which we call monitoring zones) should +include also externa zones. 
Who monitors whom (and how) can be +configured in a matrix definition: + + https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/config.yaml + + +Monitoring Environments +----------------------- + +These targets are covered by the SD2 monitoring setup and are +displayed in separate tabs (or on separate pages for the Swisscloud): + +* eu-de, +* eu-nl, and +* eu-ch2 (Swisscloud). + + +Monitoring Zones +---------------- + +From these zones the monitoring probes are sent to the targets: +* Inside OTC (eu-de, eu-ch2) +* Outside OTC (Swisscloud) + + +Scope of monitoring +------------------- + +The SD2 is a special application of the more generic Stackmon project +and utilizes several plugins to collect its metrics: + +* HTTP-GET queries are sent to service API endpoints: + * applies to all services from the service catalog, + * multiple GET queries may be configured per service. + +* Static Resources + * not yet implemented in SD2 (projected for 1Q2024), + * specific services, + * availability of the resource or resource functionality. + +* Global resources + * not yet implemented in SD2 (projected for 2024), + * OTC console, + * OTC helpcenter, + * OTC community portal, + * OTC public website. + + +Example configuration of the monitoring matrix and covered services: + +.. code:: yaml + + # Mapping of environments to test projects + - env: production_eu-de + monitoring_zone: eu-de + db_entry: apimon.apimon + plugins: + - name: apimon + schedulers_inventory_group_name: schedulers + executors_inventory_group_name: executors + tests_project: apimon + tasks: + - scenario1_token.yaml + - name: epmon + epmon_inventory_group_name: epmon_de + cloud_name: production_eu-de # env in zone has few creds. 
We need to pick one + config_elements: + - antiddos + - antiddos_skip_bad_type + - as + - as_skip_v1 + - bms_skip + - cce_skip_unver + - cce + - ces + - ces_skip_v1 + - compute + - css + - cts_skip_unver + - cts + - data_protect_skip + - database_skip + - dcs + - dcs_skip_v1 + - dds + - deh + - dis_skip_unver + - dis + - dms + - dms_skip_v2 + - dns + - dws + - dws_skip_v1 + - identity + - image + - kms_skip_unver + - kms + - mrs + - nat + - network + - object_skip + - object_store + - orchestration + - rds_skip_unver + - rds_skip_v1 + - rds + - sdrs + - sfsturbo + - share + - smn + - smn_skip_v2 + - volume_skip_v2 + - volume + - env: production_eu-nl + monitoring_zone: eu-de + db_entry: apimon.apimon + plugins: + - name: apimon + schedulers_inventory_group_name: schedulers + executors_inventory_group_name: executors + #epmons_inventory_group_name: epmons + tests_project: apimon + tasks: + - scenario1_token.yaml + - name: epmon + epmon_inventory_group_name: epmon_de + cloud_name: production_eu-nl # env in zone has few creds. We need to pick one + config_elements: + - antiddos + - antiddos_skip_bad_type + - as + - as_skip_v1 + - bms_skip + - cce_skip_unver + - cce + - ces + - ces_skip_v1 + - compute + - css + - cts_skip_unver + - cts + - data_protect_skip + - database_skip + - dcs + - dcs_skip_v1 + - dds + - deh + - dis_skip_unver + - dis + - dms + - dms_skip_v2 + - dns + - dws + - dws_skip_v1 + - identity + - image + - kms_skip_unver + - kms + - mrs + - nat + - network + - object_skip + - object_store + - orchestration + - rds_skip_unver + - rds_skip_v1 + - rds + - sdrs + - sfsturbo + - share + - smn + - smn_skip_v2 + - volume_skip_v2 + - volume + +Note that Service Managers or Engineers usually don't need to +touch this configuration. Details should be negotiated with +Platform Engineers. + +The attribute ``env`` defines the target for monitoring (which +region is to be monitored). 
The attribute ``monitoring_zone`` +defines the source of monitoring (from which region the monitoring +will be triggered). + +Note that this configuration covers not only SD2 component, but +also the even more generic Stackmon framework. It is plugin based +so additional plugins can be added. Currently two plugins are enabled: + +- apimon +- epmon + +Apimon plugin triggers scenario-based Ansible playbooks which +simulate the customer use-cases including also creation of +resources (POST requests). Currently only one scenario is enabled +for token authorization (scenario1_token.yaml). As the SD2 only +evaluates the HTTP GET metrics other scenarios are not yet enabled. +Playbooks are stored on GitHub at: + + https://github.com/stackmon/apimon-tests/tree/main/playbooks + +The EpMon plugin defines which service entries are used in which +specific environment. Services not present in an environment +won't have entry in this config as well, respectively. \ No newline at end of file diff --git a/doc/source/training/sd2_training/notifications.rst b/doc/source/training/sd2_training/notifications.rst new file mode 100644 index 0000000..4358a9d --- /dev/null +++ b/doc/source/training/sd2_training/notifications.rst @@ -0,0 +1,25 @@ +.. _sd2_notifications: + +============= +Notifications +============= + +Status Dashboard application comes with a RSS feeds to provide the information about the incidents + +Current RSS Feeds based on the "feedgen" library. +https://pypi.org/project/feedgen/ + +RSS feeds support region based queries and service name and service category based queries. 
+ +Example of region based query: + +https://status.cloudmon.eco.tsi-dev.otc-service.com/rss/?mt=EU-DE + +Example of service category based query: + +https://status.cloudmon.eco.tsi-dev.otc-service.com/rss/?srvc=Compute + +Example of a combined region and service name based query: + +https://status.cloudmon.eco.tsi-dev.otc-service.com/rss/?mt=EU-DE&srv=Data%20Warehouse%20Service + diff --git a/doc/source/training/sd2_training/onepager.rst b/doc/source/training/sd2_training/onepager.rst new file mode 100644 index 0000000..3d1d330 --- /dev/null +++ b/doc/source/training/sd2_training/onepager.rst @@ -0,0 +1,243 @@ +OTC Status Dashboard 2: Cheat-Sheet for Squad Service Managers +============================================================== + +The Status Dashboard 2 (SD2) is a service facility monitoring all OTC +services, intended for customers to grasp an overview of the service +availability. It comprises a set of **monitoring zones**, each +monitoring services of a **monitoring environment** (a. k. a. regions +like eu-de, eu-nl, etc.). The mapping of monitoring zones to monitoring +sites is configured with an HA approach in mind by the Ecosystem Squad +and is not described in technical detail in this document. + +Additionally, the web-based Dashboard itself serves the monitored data +in a frontend component visible to OTC customers. The general assumption +of the SD2 is that “no news is good news”. Technically speaking the SD2 +doesn’t receive any monitoring metrics, but only **incidents**. Once the +SD2 receives such an incident, the associated service is marked with +yellow or red semaphores, otherwise every service stays with a green +semaphore. + +Each squad should appoint one or more colleagues for the role of a +**Service Incident Engineer (SIE)**. The SIEs define the exact +conditions when a yellow or red semaphore should be raised. This +document is intended for them.
+ +As a secondary target group this document may also be useful for +**Service Incident Managers (SIM)**. It’s the role’s responsibility to +react on incoming incidents, initiate mitigation activities, explain the +situation to customers, and eventually close incidents once they’re +resolved. For SIMs it might be useful to understand *why* incidents are +raised, but they may not need to know *how* exactly this happens. + +Simplified architectural overview and data flow +----------------------------------------------- + +The SD2 is a specific application of the much more general Stackmon +framework for cloud monitoring. It is licensed as open source software, +initiated by the OTC, and developed together with the Community. Due to +this design, the monitoring data flows through several stages. Most of +them can be configured and customized to a great extent to serve many +different purposes. However, SD2 comes with a number of assumptions and +pre-configurations to reduce complexity for SIEs and SIMs. + +The data flows through these stages: A plugin collects the raw metrics +from the live systems of the OTC. For the SD2 the EPMon-plugin is used, +which is an abbreviation for “endpoint monitoring”. This means that the +plugin sends HTTP-GET-requests to API endpoints that are listed in the +OTC service catalogue. Typically simple “list” requests are queried, and +no actual resources are created or modified by the action. The +EPMon-plugin records only the status code and the round-trip-time for +the response. There is a maximum timeout configured. The results of the +probes are stored in a TSDB implemented by Graphite. By means of some +Graphite queries, the raw data is aggregated, resulting in several +**flags**. For example, if less than 90% of all queries to the +ABC-service in the past 15 minutes exceeded a threshold of 300ms, a flag +named “abc_unreliable” could be raised. Another example is a flag that is +raised when all probes of a test series fail, indicating the API is down.
The +**metric processor** further aggregates the flags into minor incidents +(yellow) and major outages (red). The yellow semaphores mean that a +service is degraded, dropping some requests or running into occasional +timeouts. However, the service itself is still responding. Red semaphores +indicate that a service is not available anymore at all. Note that this +is a very informal description of the semantics. The details are defined +in the service-specific configuration items covered later in this +document. Only if the metric processor actually creates an incident (of +whatever color), it is transmitted to and displayed on the SD2 website. +The incident is listed on the website and won’t go away automatically. +It requires the manual intervention of the SIM to mark the issue as +resolved. The frontend supports the SIM in this process as she or he may +report intermediate progress statements to the customers. The service data +of red semaphores is used to calculate an SLA value according to the +service description. + +The configuration of the backend is based on configuration files in Git +repositories hosted on GitHub (for the subject-based configuration) and +on Gitea (for OTC-related non-public data). Changes are requested and +tracked by GitOps methods. + +The components are distributed (to several regions and a non-OTC +platform, GCP) and designed redundantly to increase resilience against +outages of the platform. + +The SD2 frontend is connected to a Keycloak authentication proxy +instance, providing access to users listed in the OTC-LDAP directory or +optionally authenticated by GitHub as an external ID provider. The SD2 +neither stores nor processes any personal data, except for authentication when +personalized accounts are used.
+ +Accessing the platform and the configuration +-------------------------------------------- + +The SD2 for the regions eu-de and eu-nl is accessible via the Internet +at: + +:: + + https://status.cloudmon.eco.tsi-dev.otc-service.com/ + +The SD2 instance for the Swiss Cloud is available at: + +:: + + https://status-ch2.cloudmon.eco.tsi-dev.otc-service.com/ + +The access for SIMs to edit or resolve incidents is available when you +extend the mentioned dashboard URLs by “/login/openid”. + +The public configuration repository for the eu-de and eu-nl is at: + +:: + + https://github.com/opentelekomcloud-infra/stackmon-config + +Consult the upcoming sections to configure any service metrics, flags, +and semaphores. + +Customizing metrics +------------------- + +The actual data flow is slightly more complex than described in the +abstract sections before, but thankfully there are already working +defaults in place, so that only little configuration has to be touched. +All configuration is formatted as YAML and can be found in the +repository + +:: + + https://github.com/opentelekomcloud-infra/stackmon-config + +There are a couple of questions to be answered to follow the metrics +through the subsystems. All metrics processed by the EPMon plugin are +based on the service catalogue of the OTC (see “openstack catalog list” +for reference). + +Question 1: What HTTP GET queires should be sent to the service? + +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/epmon/config.yaml + +This file lists under the top-level key ``elements`` the services. The +important attribute here is a list of ``urls``, that get appended to the +service endpoint. With this list several aspects of the service can be +expressed. Only “list-type” queries should be listed here as the plugin +just sends a GET request and discards the actual response body. + +.. 
code:: yaml + + antiddos: # simple regular antiddos + service_type: antiddos # service_type in the catalog + sdk_proxy: anti_ddos # how SDK proxy is named + urls: # which urls to test + - / + - /antiddos + - /antiddos/query_config_list + - /antiddos/default/config + - /antiddos/weekly + +If a service catalog entry should – for whatever reasons – not be +queried, assign an empty list to the ``urls``\ attribute. + +Question 2: What flags should be defined for a service? + +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/flag_metrics.yaml + +Under the top-level attribute ``flag metrics`` a long list of +``services`` are associated with a condition, which is abstracted by a +``template``. Effectively and as a default momst often three flags are +defined: ``api_down``, ``api_slow``, and ``api_success_rate_low``. The +“implementation” of the flag’s semantics are externalized in templates +and contain complex Graphite queries. The implementation is not +important in the context of this primer. + +.. code:: yaml + + ### Anti-DDoS + - name: "api_down" + service: "antiddos" + template: + name: "api_down" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + + - name: "api_slow" + service: "antiddos" + template: + name: "api_slow" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + + - name: "api_success_rate_low" + service: "antiddos" + template: + name: "api_success_rate_low" + environments: + - name: "production_eu-de" + - name: "production_eu-nl" + +The flag ``api_down`` means that all queries of a test series have +failed without exception. The flag ``api_slow`` is raised when the +average RTT in a test series took longer than 300 ms. The flag +``api_success_rate_low`` is similar to ``api_down``, but a bit relaxter, +as it is raised only if 90% or less of the queries succeed. 
In the +template file +(https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/metric_templates.yaml) +there are three additional flag definitions listed, but they are +currently not widely used. Custom queries could theoretically be added +with their own templates, but this is beyond the scope of this document. + +The flags are referenced in upcoming files as *service*._name_, for +example as ``antiddos.api_slow`` in the second example instance. + +Question 3: What is the impact of one or more raised flags? + +https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/mp-prod/conf.d/health_metrics.yaml + +.. code:: yaml + + ### Anti-DDoS + antiddos: + service: antiddos + component_name: "Anti DDoS" + category: database + metrics: + - antiddos.api_down + - antiddos.api_slow + - antiddos.api_success_rate_low + expressions: + - expression: "antiddos.api_slow || antiddos.api_success_rate_low" + weight: 1 + - expression: "antiddos.api_down" + weight: 2 + +In this file, the top-level ``health_metrics`` key holds a long list of +semaphores. The value of the semaphores are mapped to the colors, 1 +meaning yellow and 2 resulting in a red incident or outage, +respectively. The configuration items define how this mapping is done: +The ``metrics`` from the previous section are listed as a declaration, +the key ``expressions`` specify the actual mapping. Typically not much +needs to be changed here unless no new flags are introduced or the +semantics of red and yellow should be changed. + +Should the outcome of this mapping result in a yellow or red semaphore +an incident for the corresponding service is created, sent to the SD2 +frontend and displayed. diff --git a/doc/source/training/sd2_training/recorded_session.rst b/doc/source/training/sd2_training/recorded_session.rst new file mode 100644 index 0000000..f8bd925 --- /dev/null +++ b/doc/source/training/sd2_training/recorded_session.rst @@ -0,0 +1,9 @@ +.. 
_recorded_session_sd2: + +================ +Recorded Session +================ + +Onboarding session for Compute squad from 04.10.2023 has been recorded and videos are available on OBS. + +`Recorded session `_ diff --git a/doc/source/training/sd2_training/status_dashboard_frontend.rst b/doc/source/training/sd2_training/status_dashboard_frontend.rst new file mode 100644 index 0000000..5bc962f --- /dev/null +++ b/doc/source/training/sd2_training/status_dashboard_frontend.rst @@ -0,0 +1,74 @@ +=========================== +Status Dashboard 2 Frontend +=========================== + +The web based frontend of the SD2 provides public (and internal, +after authentication) status information of OTC cloud services +across all configured regions. It supports these features: + +- Displays the service health through five service status. +- Authentication by OpenID connect (which in turn is connected + to the OTC LDAP directory). +- Several service are grouped into categories. +- Regions - several services are existing in regions. +- Incidents - entry about issues affecting certain regions and + certain services. +- Support of all OTC environments +- Incident data is available through an API. +- RSS notification (for the OTC mobile app and other integrations). +- SLA view of the services. +- Incident history. + +Two Status Dashboard portals are available: + +- public status dashboard: https://status.cloudmon.eco.tsi-dev.otc-service.com/ +- hybrid status dashboard: https://status-ch2.cloudmon.eco.tsi-dev.otc-service.com/ + +Service Health View +=================== + +.. image:: training_images/sd2_frontend.jpg + +From the architecture POV Status Dashboard is a Flask based +web server serving API and rendering web content with a +PostgreSQL database. 
The project source is available at +https://github.com/stackmon/status-dashboard + +Configuration of the status dashboard frontend is located +at github: https://github.com/opentelekomcloud-infra/stackmon-config/blob/main/sdb_prod/catalog.yaml +The ``catalog.yaml`` file contains definitions of the +service name, service type, service categories and regions. + +Example of AutoScaling service entry in SD catalog: + +.. code:: yaml + + - attributes: + category: Compute + region: EU-DE + type: as + name: Auto Scaling + - attributes: + category: Compute + region: EU-NL + type: as + name: Auto Scaling + +Applying Catalog Changes +======================== + +After the SD changes are merged on github repository the cloudmon operators must execute rollout steps on CloudMon platform: + +#. #~ cloudmon --config-dir prod --config-repo https://github.com/opentelekomcloud-infra/stackmon-config.git status dashboard provision +#. #~ kubectl exec into status dashboard container +#. #~ export FLASK_APP=status_dashboard.py +#. #~ flask bootstrap provision + +SLA view +======== + +SLA view https://status.cloudmon.eco.tsi-dev.otc-service.com/sla is calculated only from the "outage" service health status and provide 6 months SLA history of each service. + +.. image:: training_images/sd2_sla.jpg + +Details how to work with incidents are described on the :ref:`incidents ` page. 
\ No newline at end of file diff --git a/doc/source/training/sd2_training/training_images/cloud_service_statistics.png b/doc/source/training/sd2_training/training_images/cloud_service_statistics.png new file mode 100755 index 0000000..0021218 Binary files /dev/null and b/doc/source/training/sd2_training/training_images/cloud_service_statistics.png differ diff --git a/doc/source/training/sd2_training/training_images/flag_and_health_dashboard.png b/doc/source/training/sd2_training/training_images/flag_and_health_dashboard.png new file mode 100755 index 0000000..dc525ab Binary files /dev/null and b/doc/source/training/sd2_training/training_images/flag_and_health_dashboard.png differ diff --git a/doc/source/training/sd2_training/training_images/graphite_query.png b/doc/source/training/sd2_training/training_images/graphite_query.png new file mode 100755 index 0000000..83f14ea Binary files /dev/null and b/doc/source/training/sd2_training/training_images/graphite_query.png differ diff --git a/doc/source/training/sd2_training/training_images/mp_query.png b/doc/source/training/sd2_training/training_images/mp_query.png new file mode 100755 index 0000000..75ae814 Binary files /dev/null and b/doc/source/training/sd2_training/training_images/mp_query.png differ diff --git a/doc/source/training/sd2_training/training_images/sd2_data_flow.svg b/doc/source/training/sd2_training/training_images/sd2_data_flow.svg new file mode 100755 index 0000000..93e2487 --- /dev/null +++ b/doc/source/training/sd2_training/training_images/sd2_data_flow.svg @@ -0,0 +1,4 @@ + + + +

Cloudmon


Main
process
Cloudmon...
Generates full config based on public and private part
Generates full...
Execute HTTP
GET
 requests

Execute HTTP...

Statsd


Collects the
metrics
Statsd...

Cloudmon


EpMon
plugin
Cloudmon...
Send metrics to graphite
Send metrics to...
Service Squad
Servic...
Data
Sources
Data...
Create incidents based on Thresholds
Create incide...
O/M
O/M

Github


stackmon-config
repository
Github...
Pull
repository

Pull...
Management
24/7 Squad
Manage...

MP


evaluate the
service health
based on flags
MP...
Send notifications
 to MOD
Send notificati...
1
1
2
2
3
3
4
4
5
5
7
7
6
6

SD2


Shows the
service health
SD2...
Graphite TSDB



Graphite TSDB...
Metrics
Metrics

Grafana


Dashboard
Grafana...
8
8
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/source/training/sd2_training/training_images/sd2_frontend.jpg b/doc/source/training/sd2_training/training_images/sd2_frontend.jpg new file mode 100755 index 0000000..1824a03 Binary files /dev/null and b/doc/source/training/sd2_training/training_images/sd2_frontend.jpg differ diff --git a/doc/source/training/sd2_training/training_images/sd2_incident.jpg b/doc/source/training/sd2_training/training_images/sd2_incident.jpg new file mode 100755 index 0000000..f8d18eb Binary files /dev/null and b/doc/source/training/sd2_training/training_images/sd2_incident.jpg differ diff --git a/doc/source/training/sd2_training/training_images/sd2_sla.jpg b/doc/source/training/sd2_training/training_images/sd2_sla.jpg new file mode 100755 index 0000000..0ebf2f1 Binary files /dev/null and b/doc/source/training/sd2_training/training_images/sd2_sla.jpg differ diff --git a/doc/source/training/sd2_training/workflow.rst b/doc/source/training/sd2_training/workflow.rst new file mode 100644 index 0000000..25ad8c8 --- /dev/null +++ b/doc/source/training/sd2_training/workflow.rst @@ -0,0 +1,30 @@ +.. _sd2_flow: + +SD2 Data Flow Process +===================== + + +.. image:: training_images/sd2_data_flow.svg + :target: training_images/sd2_data_flow.svg + :alt: sd2_data_flow + + +#. Service squad adds new data entries in GitHub repository for + EpMon (service URL queries), adjusting flag and health + metrics if required, and adds a service entry in the SD catalog. +#. Cloudmon fetches public configuration from GitHub and internal + configuration (credentials, certs, keys, ...) from a local + repository place to generate the final configuration. +#. EpMon plugin is executed and triggers HTTP requests as defined + by the configuration. +#. Metrics resulting by the HTTP requests are collected by Statsd. +#. Collected metrics are stored in the time series database Graphite. +#. 
The Metric Processor evaluates HTTP metrics from Graphite TSDB + and generates new flag and health metrics based on defined + rules and thresholds in configuration. +#. Status Dashboard changes service health semaphore based on the + resulting health metrics from the Metric Procesor. +#. Grafana uses metrics and statistics databases as the data + sources for the dashboards. The dashboard with various panels + shows the real-time status of the platform. Grafana supports + also historical views and trends. \ No newline at end of file diff --git a/tox.ini b/tox.ini index 5a5506d..c9c84b9 100644 --- a/tox.ini +++ b/tox.ini @@ -13,6 +13,8 @@ commands = stestr run {posargs} stestr slowest [testenv:pep8] +allowlist_externals = + doc8 commands = doc8 doc/source README.rst @@ -27,69 +29,3 @@ deps = commands = sphinx-build -W --keep-going -b html doc/source/ doc/build/html -[testenv:api-ref] -# This environment is called from CI scripts to test and publish -# the API Ref to docs.otc-service.com -deps = - -r{toxinidir}/requirements.txt -whitelist_externals = rm -commands = - rm -rf api-ref/build - sphinx-build -W -b html -d api-ref/build/doctrees api-ref/source api-ref/build/html - -[testenv:api-ref-pdf-docs] -deps = {[testenv:api-ref]deps} -envdir = {toxworkdir}/api-ref -whitelist_externals = - rm - make -commands = - rm -rf api-ref/build/pdf - sphinx-build -a -E -W -b latex api-ref/source api-ref/build/pdf - make -C api-ref/build/pdf - -[testenv:umn] -# This environment is called from CI scripts to test and publish -# the UMN to docs.otc-service.com -deps = - -r{toxinidir}/requirements.txt -whitelist_externals = rm -commands = - rm -rf umn/build - sphinx-build -W -b html -d umn/build/doctrees umn/source umn/build/html - -[testenv:umn-pdf-docs] -deps = {[testenv:umn]deps} -envdir = {toxworkdir}/umn -whitelist_externals = - rm - make - bash -commands = - rm -rf umn/build/pdf - sphinx-build -a -E -W -b latex umn/source umn/build/pdf - bash -c "for f in umn/build/pdf/*.gif; do 
convert $f $\{f/%gif/png\}; done || true" - bash -c "for f in umn/build/pdf/*.tex; do sed -iorig 's/\.gif//g' $f; done" - make -C umn/build/pdf - -[testenv:dev-guide] -# This environment is called from CI scripts to test and publish -# the Developer Guide to docs.otc-service.com -deps = - -r{toxinidir}/requirements.txt -whitelist_externals = rm -commands = - rm -rf dev_guide/build - sphinx-build -W -b html -d dev_guide/build/doctrees dev_guide/source dev_guide/build/html - -[testenv:dev-guide-pdf-docs] -deps = {[testenv:dev-guide]deps} -envdir = {toxworkdir}/dev_guide -whitelist_externals = - rm - make - sh -commands = - rm -rf dev_guide/build/pdf - sphinx-build -a -E -W -b latex dev_guide/source dev_guide/build/pdf - make -C dev_guide/build/pdf diff --git a/umn/requirements.txt b/umn/requirements.txt deleted file mode 100644 index 23b871f..0000000 --- a/umn/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -sphinx>=2.0.0,!=2.1.0 # BSD -otcdocstheme>=1.0.0 # Apache-2.0 -# releasenotes -reno>=3.1.0 # Apache-2.0 diff --git a/umn/source/conf.py b/umn/source/conf.py deleted file mode 100755 index 83d890b..0000000 --- a/umn/source/conf.py +++ /dev/null @@ -1,75 +0,0 @@ -# -*- coding: utf-8 -*- -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -sys.path.insert(0, os.path.abspath('../..')) -# -- General configuration ---------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. 
They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - 'sphinx.ext.autodoc', - 'otcdocstheme', -] - -# autodoc generation is a bit aggressive and a nuisance when doing heavy -# text edit cycles. -# execute "export SPHINX_DEBUG=1" in your terminal to disable - -# The suffix of source filenames. -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'internal-documentation' -copyright = '2022, Open Telekom Cloud Developers' - -# If true, '()' will be appended to :func: etc. cross-reference text. -add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -add_module_names = True - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'native' - -# -- Options for HTML output -------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. Major themes that come with -# Sphinx are currently 'default' and 'sphinxdoc'. -# html_theme_path = ["."] -# html_theme = '_theme' -# html_static_path = ['static'] -html_theme = 'otcdocs' - -# Output file base name for HTML help builder. -htmlhelp_basename = '%sdoc' % project - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto/manual]). -latex_documents = [ - ('index', - '%s.tex' % project, - '%s Documentation' % project, - 'Open Telekom Cloud Developers', 'manual'), -] - -# Example configuration for intersphinx: refer to the Python standard library. 
-#intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/umn/source/index.rst b/umn/source/index.rst deleted file mode 100644 index 30a4f11..0000000 --- a/umn/source/index.rst +++ /dev/null @@ -1,3 +0,0 @@ -====================================================== -Welcome to the documentation of internal-documentation -======================================================