Carlos Aguni

Highly motivated self-taught IT analyst. Always learning and ready to explore new skills. An eternal apprentice.


Telemetry Study - Grafana Tempo Study

30 Jun 2021 »
  • https://dev.to/antongoncharov/distributed-logs-and-tracing-with-spring-apache-camel-opentelemetry-and-grafana-example-554e
  • https://www.datadoghq.com/blog/instrument-python-apps-with-datadog-and-opentelemetry/

Infra

docker-compose.yaml

version: '3'
services:
  tempo:
    image: grafana/tempo:latest
    volumes:
      - ./config.yaml:/config.yaml
    ports:
      - 9411:9411
      - 3100:3100
    command: ["-config.file", "/config.yaml"]
  grafana:
    image: grafana/grafana:8.0.4
    ports:
      - 3000:3000
  loki:
    image: grafana/loki:2.2.1
    ports:
      - "3101:3100"
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
    command: -config.file=/etc/loki/local-config.yaml

config.yaml

server:
  http_listen_port: 3100

distributor:
  receivers:
    zipkin:

storage:
  trace:
    backend: local
    local:
      path: /tmp/tempo/blocks

loki-config.yaml

auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info


ingester:
  wal:
    enabled: true
    dir: /tmp/wal
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 1h       # Any chunk not receiving new logs in this time will be flushed
  max_chunk_age: 1h           # All chunks will be flushed when they hit this age, default is 1h
  chunk_target_size: 1048576  # Loki will attempt to build chunks up to 1.5MB, flushing first if chunk_idle_period or max_chunk_age is reached first
  chunk_retain_period: 30s    # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
  max_transfer_retries: 0     # Chunk transfers disabled

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /tmp/loki/boltdb-shipper-active
    cache_location: /tmp/loki/boltdb-shipper-cache
    cache_ttl: 24h         # Can be increased for faster performance over longer query periods, uses more disk space
    shared_store: filesystem
  filesystem:
    directory: /tmp/loki/chunks

compactor:
  working_directory: /tmp/loki/boltdb-shipper-compactor
  shared_store: filesystem

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s

ruler:
  storage:
    type: local
    local:
      directory: /tmp/loki/rules
  rule_path: /tmp/loki/rules-temp
  alertmanager_url: http://localhost:9093
  ring:
    kvstore:
      store: inmemory
  enable_api: true

Grafana Tempo

https://grafana.com/docs/tempo/latest/api_docs/pushing-spans-with-http/


[
    {
        "id": "1234",
        "traceId": "0123456789abcde1",
        "timestamp": 1625188800000000,
        "duration": 20000000,
        "name": "test",
        "tags": {
            "http.method": "GET",
            "http.path": "/api"
        },
        "localEndpoint": {
            "serviceName": "add syncdb"
        }
    },
    {
        "id": "5678",
        "traceId": "0123456789abcde1",
        "parentId": "1234",
        "timestamp": 1625188801000000,
        "duration": 5000000,
        "name": "async",
        "tags": {
            "http.method": "GET",
            "http.path": "/api",
            "params.func": "dfasdf",
            "params.diskname": "\\w0032423\\Backup",
            "error": "true"
        },
        "localEndpoint": {
            "serviceName": "add disk"
        }
    },
    {
        "id": "6789",
        "traceId": "0123456789abcde1",
        "parentId": "1234",
        "timestamp": 1625188806000000,
        "duration": 3000000,
        "name": "async",
        "tags": {
            "http.method": "GET",
            "http.path": "/api",
            "params.func": "dfasdf",
            "params.diskname": "\\w0032423\\Backup",
            "error": "true"
        },
        "localEndpoint": {
            "serviceName": "restore_db",
            "ipv4": "192.168.1.2",
            "port": 9411
        },
        "remoteEndpoint": {
            "serviceName": "spanstore-jdbc",
            "ipv4": "127.0.0.1",
            "port": 3306
        },
        "log": {
            "message": "dfasdfasdflsdlfjk asdkfasdfjasldflskdjf asdfa sdf"
        }
    }
]

[root@grafana-tempo tempo]# curl -X POST http://localhost:9411 -H 'Content-Type: application/json' -d '[{
>  "id": "1234",
>  "traceId": "0123456789abcde0",
>  "timestamp": 1625115600000000,
>  "duration": 200000,
>  "name": "span from bash!",
>  "tags": {
>     "http.method": "GET",
>     "http.path": "/api"
>   },
>   "localEndpoint": {
>     "serviceName": "shell script"
>   }
> }]'
[root@grafana-tempo tempo]# curl -X POST http://localhost:9411 -H 'Content-Type: application/json' -d '[{
>  "id": "5678",
>  "traceId": "0123456789abcde0",
>  "parentId": "1234",
>  "timestamp": 1625115610000000,
>  "duration": 150000,
>  "name": "child span from bash!",
>   "localEndpoint": {
>     "serviceName": "shell script"
>   }
> }]'
[root@grafana-tempo tempo]# curl -X POST http://localhost:9411 -H 'Content-Type: application/json' -d '[{
>  "id": "6789",
>  "traceId": "0123456789abcde0",
>  "parentId": "5678",
>  "timestamp": 1625115610100000,
>  "duration": 350000,
>  "name": "child span from bash!",
>   "localEndpoint": {
>     "serviceName": "shell script"
>   }
> }]'

OpenTracing

https://github.com/opentracing/specification/blob/master/semantic_conventions.md

Python Example

Jaeger

from opentelemetry import trace
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

trace.set_tracer_provider(
    TracerProvider(
        resource=Resource.create({SERVICE_NAME: "my-helloworld-service"})
    )
)

jaeger_exporter = JaegerExporter(
    agent_host_name="localhost",
    agent_port=6831,
)

trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(jaeger_exporter)
)

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("foo"):
    with tracer.start_as_current_span("bar"):
        with tracer.start_as_current_span("baz"):
            print("Hello world from OpenTelemetry Python!")

{
    "name": "baz",
    "context": {
        "trace_id": "0x8342e0dad3853eb922985cbcd95e3362",
        "span_id": "0x4e93b924c39c4313",
        "trace_state": "[]"
    },
    "kind": "SpanKind.INTERNAL",
    "parent_id": "0x3f734c1712c2a36f",
    "start_time": "2021-07-07T23:57:22.572984Z",
    "end_time": "2021-07-07T23:57:22.573022Z",
    "status": {
        "status_code": "UNSET"
    },
    "attributes": {},
    "events": [],
    "links": [],
    "resource": {
        "telemetry.sdk.language": "python",
        "telemetry.sdk.name": "opentelemetry",
        "telemetry.sdk.version": "1.3.0",
        "service.name": "my-helloworld-service2"
    }
}
{
    "name": "bar",
    "context": {
        "trace_id": "0x8342e0dad3853eb922985cbcd95e3362",
        "span_id": "0x3f734c1712c2a36f",
        "trace_state": "[]"
    },
    "kind": "SpanKind.INTERNAL",
    "parent_id": "0xd302a32ab0c215b3",
    "start_time": "2021-07-07T23:57:22.572933Z",
    "end_time": "2021-07-07T23:57:22.573243Z",
    "status": {
        "status_code": "UNSET"
    },
    "attributes": {},
    "events": [],
    "links": [],
    "resource": {
        "telemetry.sdk.language": "python",
        "telemetry.sdk.name": "opentelemetry",
        "telemetry.sdk.version": "1.3.0",
        "service.name": "my-helloworld-service2"
    }
}
{
    "name": "foo2",
    "context": {
        "trace_id": "0x8342e0dad3853eb922985cbcd95e3362",
        "span_id": "0xd302a32ab0c215b3",
        "trace_state": "[]"
    },
    "kind": "SpanKind.INTERNAL",
    "parent_id": null,
    "start_time": "2021-07-07T23:57:22.572830Z",
    "end_time": "2021-07-07T23:57:22.573358Z",
    "status": {
        "status_code": "UNSET"
    },
    "attributes": {},
    "events": [],
    "links": [],
    "resource": {
        "telemetry.sdk.language": "python",
        "telemetry.sdk.name": "opentelemetry",
        "telemetry.sdk.version": "1.3.0",
        "service.name": "my-helloworld-service2"
    }
}

Zipkin

from opentelemetry import trace
#from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.exporter.zipkin.json import ZipkinExporter
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (
    BatchSpanProcessor,
    ConsoleSpanExporter,
    SimpleSpanProcessor,
)

trace.set_tracer_provider(
    TracerProvider(
        resource=Resource.create({SERVICE_NAME: "my-helloworld-service"})
    )
)

zipkin_exporter = ZipkinExporter(                     
    # version=Protocol.V2                             
    # optional:                                       
    # endpoint="http://localhost:9411/api/v2/spans",  
    # local_node_ipv4="192.168.0.1",                  
    # local_node_ipv6="2001:db8::c001",               
    # local_node_port=31313,                          
    # max_tag_value_length=256                        
    # timeout=5 (in seconds)                          
    endpoint="http://grafana-tempo:9411/api/v2/spans",
)                                                     


trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(zipkin_exporter)
)
trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(ConsoleSpanExporter)
    #SimpleSpanProcessor(ConsoleSpanExporter)
)

tracer = trace.get_tracer(__name__)

with tracer.start_as_current_span("foo"):
    with tracer.start_as_current_span("bar"):
        with tracer.start_as_current_span("baz"):
            print("Hello world from OpenTelemetry Python!")

Zipkin

Instrumenting

Annotation

An Annotation is used to record an occurrence in time. There’s a set of core annotations used to define the beginning and end of an RPC request:

  • cs - Client Send. The client has made the request. This sets the beginning of the span.
  • sr - Server Receive: The server has received the request and will start processing it. The difference between this and cs will be combination of network latency and clock jitter.
  • ss - Server Send: The server has completed processing and has sent the request back to the client. The difference between this and sr will be the amount of time it took the server to process the request.
  • cr - Client Receive: The client has received the response from the server. This sets the end of the span. The RPC is considered complete when this annotation is recorded.

When using message brokers instead of RPCs, the following annotations help clarify the direction of the flow:

  • ms - Message Send: The producer sends a message to a broker.
  • mr - Message Receive: A consumer received a message from a broker.

Data Model

https://zipkin.io/pages/data_model.html

 [
    {
      "traceId": "5982fe77008310cc80f1da5e10147517",
      "name": "get",
      "id": "bd7a977555f6b982",
      "timestamp": 1458702548467000,
      "duration": 386000,
      "localEndpoint": {
        "serviceName": "zipkin-query",
        "ipv4": "192.168.1.2",
        "port": 9411
      },
      "annotations": [
        {
          "timestamp": 1458702548467000,
          "value": "sr"
        },
        {
          "timestamp": 1458702548853000,
          "value": "ss"
        }
      ]
    },
    {
      "traceId": "5982fe77008310cc80f1da5e10147517",
      "name": "get-traces",
      "id": "ebf33e1a81dc6f71",
      "parentId": "bd7a977555f6b982",
      "timestamp": 1458702548478000,
      "duration": 354374,
      "localEndpoint": {
        "serviceName": "zipkin-query",
        "ipv4": "192.168.1.2",
        "port": 9411
      },
      "tags": {
        "lc": "JDBCSpanStore",
        "request": "QueryRequest{serviceName=zipkin-query, spanName=null, annotations=[], binaryAnnotations={}, minDuration=null, maxDuration=null, endTs=1458702548478, lookback=86400000, limit=1}"
      }
    },
    {
      "traceId": "5982fe77008310cc80f1da5e10147517",
      "name": "query",
      "id": "be2d01e33cc78d97",
      "parentId": "ebf33e1a81dc6f71",
      "timestamp": 1458702548786000,
      "duration": 13000,
      "localEndpoint": {
        "serviceName": "zipkin-query",
        "ipv4": "192.168.1.2",
        "port": 9411
      },
      "remoteEndpoint": {
        "serviceName": "spanstore-jdbc",
        "ipv4": "127.0.0.1",
        "port": 3306
      },
      "annotations": [
        {
          "timestamp": 1458702548786000,
          "value": "cs"
        },
        {
          "timestamp": 1458702548799000,
          "value": "cr"
        }
      ],
      "tags": {
        "jdbc.query": "select distinct `zipkin_spans`.`trace_id` from `zipkin_spans` join `zipkin_annotations` on (`zipkin_spans`.`trace_id` = `zipkin_annotations`.`trace_id` and `zipkin_spans`.`id` = `zipkin_annotations`.`span_id`) where (`zipkin_annotations`.`endpoint_service_name` = ? and `zipkin_spans`.`start_ts` between ? and ?) order by `zipkin_spans`.`start_ts` desc limit ?",
        "sa": "true"
      }
    },
    {
      "traceId": "5982fe77008310cc80f1da5e10147517",
      "name": "query",
      "id": "13038c5fee5a2f2e",
      "parentId": "ebf33e1a81dc6f71",
      "timestamp": 1458702548817000,
      "duration": 1000,
      "localEndpoint": {
        "serviceName": "zipkin-query",
        "ipv4": "192.168.1.2",
        "port": 9411
      },
      "remoteEndpoint": {
        "serviceName": "spanstore-jdbc",
        "ipv4": "127.0.0.1",
        "port": 3306
      },
      "annotations": [
        {
          "timestamp": 1458702548817000,
          "value": "cs"
        },
        {
          "timestamp": 1458702548818000,
          "value": "cr"
        }
      ],
      "tags": {
        "jdbc.query": "select `zipkin_spans`.`trace_id`, `zipkin_spans`.`id`, `zipkin_spans`.`name`, `zipkin_spans`.`parent_id`, `zipkin_spans`.`debug`, `zipkin_spans`.`start_ts`, `zipkin_spans`.`duration` from `zipkin_spans` where `zipkin_spans`.`trace_id` in (?)",
        "sa": "true"
      }
    },
    {
      "traceId": "5982fe77008310cc80f1da5e10147517",
      "name": "query",
      "id": "37ee55f3d3a94336",
      "parentId": "ebf33e1a81dc6f71",
      "timestamp": 1458702548827000,
      "duration": 2000,
      "localEndpoint": {
        "serviceName": "zipkin-query",
        "ipv4": "192.168.1.2",
        "port": 9411
      },
      "remoteEndpoint": {
        "serviceName": "spanstore-jdbc",
        "ipv4": "127.0.0.1",
        "port": 3306
      },
      "annotations": [
        {
          "timestamp": 1458702548827000,
          "value": "cs"
        },
        {
          "timestamp": 1458702548829000,
          "value": "cr"
        }
      ],
      "tags": {
        "jdbc.query": "select `zipkin_annotations`.`trace_id`, `zipkin_annotations`.`span_id`, `zipkin_annotations`.`a_key`, `zipkin_annotations`.`a_value`, `zipkin_annotations`.`a_type`, `zipkin_annotations`.`a_timestamp`, `zipkin_annotations`.`endpoint_ipv4`, `zipkin_annotations`.`endpoint_port`, `zipkin_annotations`.`endpoint_service_name` from `zipkin_annotations` where `zipkin_annotations`.`trace_id` in (?) order by `zipkin_annotations`.`a_timestamp` asc, `zipkin_annotations`.`a_key` asc",
        "sa": "true"
      }
    }
  ]