Dependency | Reason |
---|---|
Task Instance State | Task is in the 'failed' state which is not a valid state for execution. The task must be cleared in order to be run. |
Dagrun Running | Task instance's dagrun was not in the 'running' state but in the state 'failed'. |
Execution Date | The execution date is 2022-05-06T02:00:00+00:00 but this is before the task's start date 2024-09-20T20:00:00+00:00. |
def check_input_persisted_in_s3(**kwargs):
    """
    Check that the input data for the execution hour has been persisted to S3.

    Lists the version folders under the configured key prefix and checks
    whether a path for the execution date/hour exists under any of them.
    If at least one path is present the step succeeds; otherwise an
    exception is raised to signal input data unavailability. Hours listed
    in ``expected_hours_to_skip_for_alert`` skip the alert entirely.

    Expected kwargs (Airflow context plus op_kwargs):
        ts_nodash: execution timestamp formatted '%Y%m%dT%H%M%S'.
        input-validation-step-config: name of the Airflow Variable holding
            the step configuration JSON (input_bucket, key_prefix,
            expected_hours_to_skip_for_alert).

    Raises:
        Exception: if no input path exists for a non-skipped hour.
    """
    exec_datehour = datetime.strptime(kwargs['ts_nodash'], '%Y%m%dT%H%M%S')
    validation_configs = kwargs['input-validation-step-config']
    step_conf = Variable.get(validation_configs, deserialize_json=True)
    input_bucket = step_conf['input_bucket']
    key_prefix = step_conf['key_prefix']
    input_hours_to_skip_for_alert = step_conf['expected_hours_to_skip_for_alert'].split(",")
    aws_region = Variable.get('aws_region').lower()
    s3_client = boto3.client('s3', aws_region)
    versions_list = list_folders(s3_client, input_bucket, key_prefix)
    # One candidate prefix per version folder, e.g. "<version>2022-05-06/02".
    input_path_versions_datehour = [
        version_prefix + f"{exec_datehour.date()}/{exec_datehour.hour:02d}"
        for version_prefix in versions_list
    ]
    paths_present = []
    for prefix in input_path_versions_datehour:
        try:
            resp = s3_client.list_objects(Bucket=input_bucket, Prefix=prefix, Delimiter='/', MaxKeys=1)
            # 'CommonPrefixes' is only present in the response when at least
            # one key matches the prefix, i.e. the hour "folder" exists.
            if 'CommonPrefixes' in resp:
                paths_present.append(prefix)
        except ClientError as exc:
            # BUG FIX: original passed `exc` as a stray positional arg to an
            # f-string message with no placeholder, so the exception text was
            # never logged; use logging's lazy %-formatting instead.
            LOG.info("Path not found for %s: %s", prefix, exc)
    if '{:02d}'.format(exec_datehour.hour) not in input_hours_to_skip_for_alert:
        if paths_present:
            LOG.info(f"Input path exists for the hour {exec_datehour} : {paths_present}")
        else:
            raise Exception(f"Input path does not exist for the hour: {exec_datehour} in s3://{input_bucket}/{key_prefix}")
    else:
        LOG.info(f"Skipped check for the hour {exec_datehour}")
Attribute | Value |
---|---|
dag_id | livepop-shuffle-aggregator-modeltransform_v004 |
duration | 0.540278 |
end_date | 2022-05-06 03:21:18.093891+00:00 |
execution_date | 2022-05-06T02:00:00+00:00 |
executor_config | {} |
generate_command | <function TaskInstance.generate_command at 0x7f603554be18> |
hostname | airflow-worker-0.airflow-worker.bigdata.svc.cluster.local |
is_premature | False |
job_id | 179959 |
key | ('livepop-shuffle-aggregator-modeltransform_v004', 'check_input_available', <Pendulum [2022-05-06T02:00:00+00:00]>, 6) |
log | <Logger airflow.task (INFO)> |
log_filepath | /opt/airflow/logs/livepop-shuffle-aggregator-modeltransform_v004/check_input_available/2022-05-06T02:00:00+00:00.log |
log_url | https://airflow.devel.viooh.net.cn/admin/airflow/log?execution_date=2022-05-06T02%3A00%3A00%2B00%3A00&task_id=check_input_available&dag_id=livepop-shuffle-aggregator-modeltransform_v004 |
logger | <Logger airflow.task (INFO)> |
mark_success_url | https://airflow.devel.viooh.net.cn/success?task_id=check_input_available&dag_id=livepop-shuffle-aggregator-modeltransform_v004&execution_date=2022-05-06T02%3A00%3A00%2B00%3A00&upstream=false&downstream=false |
max_tries | 4 |
metadata | MetaData(bind=None) |
next_try_number | 6 |
operator | PythonOperator |
pid | 2521 |
pool | default_pool |
pool_slots | 1 |
prev_attempted_tries | 5 |
previous_execution_date_success | 2022-05-05 20:00:00+00:00 |
previous_start_date_success | 2022-05-05 21:00:06.355451+00:00 |
previous_ti | <TaskInstance: livepop-shuffle-aggregator-modeltransform_v004.check_input_available 2022-05-06 01:00:00+00:00 [failed]> |
previous_ti_success | <TaskInstance: livepop-shuffle-aggregator-modeltransform_v004.check_input_available 2022-05-05 20:00:00+00:00 [success]> |
priority_weight | 4 |
queue | default |
queued_dttm | 2022-05-06 03:21:15.210929+00:00 |
raw | False |
run_as_user | None |
start_date | 2022-05-06 03:21:17.553613+00:00 |
state | failed |
task | <Task(PythonOperator): check_input_available> |
task_id | check_input_available |
test_mode | False |
try_number | 6 |
unixname | airflow |
Attribute | Value |
---|---|
dag | <DAG: livepop-shuffle-aggregator-modeltransform_v004> |
dag_id | livepop-shuffle-aggregator-modeltransform_v004 |
depends_on_past | False |
deps | {<TIDep(Not Previously Skipped)>, <TIDep(Trigger Rule)>, <TIDep(Previous Dagrun State)>, <TIDep(Not In Retry Period)>} |
do_xcom_push | True |
downstream_list | [<Task(PythonOperator): create_emr_steps>] |
downstream_task_ids | {'create_emr_steps'} |
email | None |
email_on_failure | True |
email_on_retry | True |
end_date | None |
execution_timeout | None |
executor_config | {} |
extra_links | [] |
global_operator_extra_link_dict | {} |
inlets | [] |
lineage_data | None |
log | <Logger airflow.task.operators (INFO)> |
logger | <Logger airflow.task.operators (INFO)> |
max_retry_delay | None |
on_failure_callback | <function task_fail_slack_alert at 0x7f602bc8d158> |
on_retry_callback | None |
on_success_callback | None |
op_args | [] |
op_kwargs | {'master-instance-types': 'm5.xlarge,m5.2xlarge', 'core-instance-types': 'm5.xlarge,m5.2xlarge', 'task-instance-types': 'm5.xlarge,m5.2xlarge', 'core-instance-capacity': 3, 'task-instance-capacity': 0, 'ebs-volume-size': '50', 'emr-version': 'emr-6.14.0', 'input-validation-step-config': 'livepop_shuffle_job_validation_config', 'emr-steps': '[\n {\n "step-name": "LivePOPShuffle",\n "config-json": [\n {"spark.driver.memory":"9g"},\n {"spark.serializer":"org.apache.spark.serializer.KryoSerializer"}\n ],\n "main-class": "com.viooh.pop.data.live.shuffle.RawLivePOPShuffleMain",\n "group-id":"com/viooh/pop",\n "artifact": "pop-shuffle-live",\n "jars": "/usr/lib/spark/connector/lib/spark-avro.jar",\n "files": "s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.conf,s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.yaml"\n },\n {\n "step-name": "PopAggregator",\n "config-json": [\n {"spark.driver.memory":"9g"},\n {"spark.serializer":"org.apache.spark.serializer.KryoSerializer"}\n ],\n "main-class": "com.viooh.pop.aggregator.livepop.LivePOPAggregatorMain",\n "group-id":"com/viooh/pop",\n "artifact": "pop-data-aggregator",\n "files": "s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.conf,s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.yaml"\n },\n {\n "step-name": "ModelTransform",\n "config-json": [\n {"spark.driver.memory":"9g"}\n ],\n "main-class": "uk.co.viooh.job.modeltransform.ModelTransform",\n "group-id": "uk/co/viooh",\n "artifact": "pandora-model-transform",\n "files": "s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.conf,s3://viooh-spark-artifacts-lab-cn/metrics/batch-job-metrics/0.0.0/job.yaml"\n }\n]', 'cluster-name': 'livepop-shuffle-aggregator-modeltransform', 'dag-id': 'livepop-shuffle-aggregator-modeltransform_v004', 'schedule_interval': '0 * * * *', 'trigger_dags': []} |
operator_extra_link_dict | {} |
operator_extra_links | () |
outlets | [] |
owner | data.engineers@viooh.com |
params | {} |
pool | default_pool |
pool_slots | 1 |
priority_weight | 1 |
priority_weight_total | 4 |
provide_context | True |
queue | default |
resources | None |
retries | 4 |
retry_delay | 0:05:00 |
retry_exponential_backoff | False |
run_as_user | None |
schedule_interval | 0 * * * * |
shallow_copy_attrs | ('python_callable', 'op_kwargs') |
sla | None |
start_date | 2024-09-20 20:00:00+00:00 |
subdag | None |
task_concurrency | None |
task_id | check_input_available |
task_type | PythonOperator |
template_ext | [] |
template_fields | ('templates_dict', 'op_args', 'op_kwargs') |
templates_dict | None |
trigger_rule | all_success |
ui_color | #ffefeb |
ui_fgcolor | #000 |
upstream_list | [] |
upstream_task_ids | set() |
wait_for_downstream | False |
weight_rule | downstream |