Source code for ibmcloudsql.sql_query_ts

# ------------------------------------------------------------------------------
# Copyright IBM Corp. 2020
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ------------------------------------------------------------------------------
# flake8: noqa F522, E203
import re
import threading

from isodate import ISO8601Error

try:
    from SQLQuery import SQLQuery
    from sql_magic import print_sql
except ImportError:
    from .SQLQuery import SQLQuery
    from .sql_magic import print_sql

lock = threading.Lock()


[docs]class SQLClientTimeSeries(SQLQuery): """This class augments SQLClient with time-series functionality HISTORY: Aug-10-2021: expand _get_ts_datasource_v3() """ def __init__( self, api_key="", instance_crn="", target_cos_url=None, max_concurrent_jobs=4, max_tries=1, iam_max_tries=1, thread_safe=False, client_info="TimeSeries Cloud SQL Query Python", ): SQLQuery.__init__( self, api_key=api_key, instance_crn=instance_crn, target_cos_url=target_cos_url, max_concurrent_jobs=max_concurrent_jobs, max_tries=max_tries, iam_max_tries=iam_max_tries, thread_safe=thread_safe, client_info=client_info, ) def _get_ts_datasource( self, table_name, key, time_stamp, observation, cos_out, granularity="raw", where_clause="", ops="avg", dry_run=False, keep_col_names: bool = True, cast_observation=None, num_objects=None, num_rows=None, ): """ Prepare the data source for time-series in the next-query It will returns the data source in 3 columns: field_name, time_stamp, observation Parameters -------------- table: str The catalog table name or the view-table that you generate from the WITH clause via :meth:`with_` method key: str The column name being used as the key, and is maped to `field_name` time_stamp: str The column name being used as timetick, and is mapped to `time_stamp` observation: str The column name being used as value, and is maped to `observation` cos_out: str The COS URL where the data is copied to - later as data source granularity: str There are 2 options: * a value in one of ["raw", "per_min", "per_<x>min", "per_sec", "per_<x>sec", "per_hour", "per_<x>hour"] with <x> is a number divided by 60, e.g. 10, 15 * a value of "per_day" * a value that follows ISO 8601 'duration', e.g. PT1M, PT1S, PT2H ops: str The aggregation method: "avg", "sum", "max", "min", "count" dry_run: bool, optional This option, once selected as True, returns the internally generated SQL statement, and no job is queried. num_objects: None or int The number of objects to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. num_rows: None or int The number of rows for each object to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. keep_col_names: bool, optional (False) By default, all 3 original column names are maintained. If you set to false, they are mapped to `field_name` (for key), `time_stamp` and `observation`, respectively. 
cast_observation: str, optional=None The type to be casted for the `observation` column Returns ---------- str The COS_URL where the data with 3 fields (key, time_stamp, observation) and can be digested into time-series via TIME_SERIES_FORMAT(key, timestick, value) Raises ------ isodate.ISO8601Error: granularity input is not a valid ISO 8601-compliant value """ tmp_cos = self.target_cos_url if num_objects is None and num_rows is None: print("provide at least `num_objects` or `num_rows`") assert 0 if num_objects is not None and num_rows is not None: print("can't use both `num_objects` and `num_rows`") assert 0 if granularity == "raw": pass elif granularity == "per_min": pass if time_stamp in self._unixtime_columns: # -- convert unix-time in ms to sec time_info = "cast({time_stamp}/1000 as timestamp)".format( time_stamp=time_stamp ) else: time_info = "{time_stamp}".format(time_stamp=time_stamp) if keep_col_names is True: key_col = "field_name as {key}".format(key=key) obs_col = "{observation}".format(observation=observation) else: key_col = "field_name, -- as {key}".format(key=key) obs_col = "{observation} as observation".format(observation=observation) if granularity == "raw": tmp_cos = cos_out if len(where_clause) == 0: extra_where = "" else: extra_where = "AND " + where_clause if num_objects: partition_clause = "PARTITIONED INTO {x} OBJECTS".format(x=num_objects) elif num_rows: partition_clause = "PARTITIONED EVERY {x} ROWS".format(x=num_rows) if cast_observation is None: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, {observation} as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, partition_clause=partition_clause, ) else: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, cast({observation} AS {cast_observation}) as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, cast_observation=cast_observation, partition_clause=partition_clause, ) if self._has_with_clause and not self._has_select_clause: self._sql_stmt = self._sql_stmt + select_stmt else: self._sql_stmt = select_stmt result = None if dry_run: print_sql(self._sql_stmt) select_container_id_full_location = "cos://dry_run/" else: result = self.execute_sql(self._sql_stmt) select_container_id_full_location = self.get_job(result.job_id)[ "resultset_location" ] cos_in = "" if granularity == "raw": return result else: cos_in = select_container_id_full_location def query_day(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*day") p2 = re.compile(r"per_[0-9]+day") level = "raw" num_day = 1 if p.match(granularity): level = "day" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_day = list(map(int, temp))[0] assert 1 % num_day == 0 else: if granularity[0:1].upper() == "P" and granularity[-1].upper() == "D": level = "day" try: import isodate x = isodate.parse_duration(granularity.strip()) num_day = int(x.total_seconds() / 60 / 60 / 24) # (day) assert 1 % num_day == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "day": if num_day == 1: sql_stmt = 
""" select field_name, to_date(time_stamp) as time_stamp_date, {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, partition_clause=partition_clause, ops=ops, ) else: # num_hour > 1 msg = "Can't use a day value > 1" raise Exception(msg) assert 0 sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", "00:00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", "00:00"), "yyyy-MM-dd HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, datetime_col=datetime_col, obs_col=obs_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_hour(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*hour") p2 = re.compile(r"per_[0-9]+hour") level = "raw" num_hour = 1 if p.match(granularity): level = "hour" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_hour = list(map(int, temp))[0] assert 24 % num_hour == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "H": level = "hour" try: import isodate x = isodate.parse_duration(granularity.strip()) num_hour = int(x.total_seconds() / 60 / 60) # (hour) assert 24 % num_hour == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "hour": if num_hour == 1: sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) else: # num_hour > 1 sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, 
num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":00"), "yyyy-MM-dd HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_min(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*min") p2 = re.compile(r"per_[0-9]+min") level = "raw" num_min = 1 if p.match(granularity): level = "minute" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_min = list(map(int, temp))[0] assert 60 % num_min == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "M": level = "minute" try: import isodate x = isodate.parse_duration(granularity.strip()) num_min = int(x.total_seconds() / 60) # (minute) assert 60 % num_min == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "minute": sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, (floor(minute(time_stamp)/{num_min})) * {num_min} as time_stamp_minute, -- within [current, current+{num_min}) minute time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, to_date(time_stamp), hour(time_stamp), (floor(minute(time_stamp)/{num_min})) * {num_min} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_min=num_min, partition_clause=partition_clause, ops=ops, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":", time_stamp_minute), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":", time_stamp_minute), "yyyy-MM-dd HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_sec(): """ return: None or 'result' """ p = re.compile(r"per_[0-9]*sec") p2 = re.compile(r"per_[0-9]+sec") level = "raw" num_sec = 1 if p.match(granularity): level = "sec" if p2.match(granularity): temp = re.findall(r"\d+", 
granularity) num_sec = list(map(int, temp))[0] assert 60 % num_sec == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "S": level = "sec" try: import isodate x = isodate.parse_duration(granularity.strip()) num_sec = int(x.total_seconds()) assert 60 % num_sec == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "sec": sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, minute(time_stamp) as time_stamp_minute, (floor(second(time_stamp)/{num_sec})) * {num_sec} as time_stamp_second, -- within [current, current+{num_sec}) second time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, to_date(time_stamp), hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/{num_sec})) * {num_sec} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_sec=num_sec, partition_clause=partition_clause, ops=ops, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: init_cos = "cos://dry_run/" else: init_cos = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result result = query_min() if result is None: result = query_sec() if result is None: result = query_hour() if result is None: result = query_day() return result
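    # ---------------------------------------------------------------------
    # Note (illustration only, not part of the class): the granularity
    # strings accepted above are resolved in two ways -- "per_<x>min",
    # "per_<x>sec" and "per_<x>hour" are matched with a regex, while ISO 8601
    # durations such as "PT15M" are parsed with isodate.  In both cases the
    # bucket size must evenly divide its parent unit (60 sec, 60 min, 24 h).
    # A minimal sketch of that logic, assuming isodate is installed:
    #
    #     import re
    #     import isodate
    #
    #     def minutes_per_bucket(granularity):
    #         if re.match(r"per_[0-9]+min", granularity):
    #             num_min = int(re.findall(r"\d+", granularity)[0])
    #         else:  # e.g. "PT15M"
    #             num_min = int(isodate.parse_duration(granularity).total_seconds() / 60)
    #         assert 60 % num_min == 0  # must divide an hour evenly
    #         return num_min
    #
    #     minutes_per_bucket("per_15min")  # -> 15
    #     minutes_per_bucket("PT10M")      # -> 10
    # ---------------------------------------------------------------------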
[docs]    def get_ts_datasource(
        self,
        cos_in,
        key,
        time_stamp,
        observation,
        cos_out,
        granularity="raw",
        where_clause="",
        ops="avg",
        dry_run=False,
        keep_col_names: bool = True,
        cast_observation=None,
        num_objects=20,
        print_warning=True,
    ):
        """
        Prepare the data source for time series in the next query, with the result
        broken down into multiple objects using `num_objects` as the criterion.

        It returns the data source in 3 columns:

        * keep_col_names <- False: use exactly these names:
          field_name, time_stamp, observation
        * keep_col_names <- True: use the values passed via the arguments
          <key>, <time_stamp>, <observation>

        Parameters
        ----------
        cos_in: str
            The data source in "COS_URL stored as <format>"
        key: str
            The column name being used as the key
        time_stamp: str
            The column name being used as the timetick
        observation: str
            The column name being used as the value
        cast_observation: str, optional=None
            The type to which the `observation` column is cast
        cos_out: str
            The COS URL where the data is copied to - used later as a data source
        granularity: str
            A value in ["raw", "per_min", "per_<x>min", "per_sec", "per_<x>sec"],
            with <x> a number that divides 60 evenly, e.g. 10, 15
        where_clause: str, optional
            An extra condition appended to the internally generated WHERE clause
        dry_run: bool, optional
            When True, the internally generated SQL statement is only printed
            and no job is launched.
        num_objects: int, optional
            The number of objects to be created for storing the data
        print_warning: bool, default=True
            Print a warning or not
        keep_col_names: bool, optional (True)
            By default, all 3 original column names are kept. If set to False,
            they are mapped to `field_name` (for the key), `time_stamp` and
            `observation`, respectively.

        Returns
        -------
        str
            The COS_URL where the data with 3 fields (key, time_stamp, observation)
            is stored, which can be digested into a time series via
            TIME_SERIES_FORMAT(key, timestick, value)
        """
        if len(self._unixtime_columns) == 0 and print_warning is True:
            msg = (
                "WARNING: You may want to assign the list of columns to `.columns_in_unixtime`"
                " to let the SQL client know which columns are timestamps in UNIX time format"
            )
            print(msg)
        return self._get_ts_datasource_v3(
            None,
            cos_in,
            key,
            time_stamp,
            observation,
            cos_out,
            granularity,
            where_clause,
            ops,
            dry_run,
            num_objects=num_objects,
            cast_observation=cast_observation,
            keep_col_names=keep_col_names,
        )
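    # Example (a sketch, not part of the library): preparing a per-15-minute
    # data source from a COS object.  The COS URLs and column names below are
    # hypothetical placeholders.
    #
    #     ts_client = SQLClientTimeSeries(
    #         api_key="<apikey>",
    #         instance_crn="<instance_crn>",
    #         target_cos_url="cos://us-south/my-bucket/tmp/",
    #     )
    #     ts_client.columns_in_unixtime = ["ts_ms"]  # columns stored as UNIX time in ms
    #     ts_client.get_ts_datasource(
    #         cos_in="cos://us-south/my-bucket/raw/ STORED AS PARQUET",
    #         key="sensor_id",
    #         time_stamp="ts_ms",
    #         observation="temperature",
    #         cos_out="cos://us-south/my-bucket/ts-source/",
    #         granularity="per_15min",
    #         dry_run=True,  # only print the generated SQL
    #     )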
[docs]    def get_ts_datasource_from_table(
        self,
        table_name,
        key,
        time_stamp,
        observation,
        cos_out,
        granularity="raw",
        where_clause="",
        ops="avg",
        dry_run=False,
        keep_col_names: bool = True,
        cast_observation=None,
        num_objects=20,
        print_warning=True,
    ):
        """
        Prepare the data source for time series in the next query, with the result
        broken down into multiple objects using `num_objects` as the criterion.

        It returns the data source in 3 columns:

        * keep_col_names <- False: use exactly these names:
          field_name, time_stamp, observation
        * keep_col_names <- True: use the values passed via the arguments
          <key>, <time_stamp>, <observation>

        Parameters
        ----------
        table_name: str
            The catalog table name
            NOTE: Use either `cos_in` (see :meth:`get_ts_datasource`) or
            `table_name`, but not both
        key: str
            The column name being used as the key
        time_stamp: str
            The column name being used as the timetick
        observation: str
            The column name being used as the value
        cast_observation: str, optional=None
            The type to which the `observation` column is cast
        cos_out: str
            The COS URL where the data is copied to - used later as a data source
        granularity: str
            A value in ["raw", "per_min", "per_<x>min", "per_sec", "per_<x>sec"],
            with <x> a number that divides 60 evenly, e.g. 10, 15
        where_clause: str, optional
            An extra condition appended to the internally generated WHERE clause
        dry_run: bool, optional
            When True, the internally generated SQL statement is only printed
            and no job is launched.
        num_objects: int, optional
            The number of objects to be created for storing the data
        print_warning: bool, default=True
            Print a warning or not
        keep_col_names: bool, optional (True)
            By default, all 3 original column names are kept. If set to False,
            they are mapped to `field_name` (for the key), `time_stamp` and
            `observation`, respectively.

        Returns
        -------
        str
            The COS_URL where the data with 3 fields (key, time_stamp, observation)
            is stored, which can be digested into a time series via
            TIME_SERIES_FORMAT(key, timestick, value)
        """
        if len(self._unixtime_columns) == 0 and print_warning is True:
            msg = (
                "WARNING: You may want to assign the list of columns to `.columns_in_unixtime`"
                " to let the SQL client know which columns are timestamps in UNIX time format"
            )
            print(msg)
        return self._get_ts_datasource_v3(
            table_name,
            None,
            key,
            time_stamp,
            observation,
            cos_out,
            granularity,
            where_clause,
            ops,
            dry_run,
            num_objects=num_objects,
            cast_observation=cast_observation,
            keep_col_names=keep_col_names,
        )
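    # Example (a sketch, not part of the library): the same preparation, but
    # starting from a catalog table instead of a COS URL.  Table and column
    # names are hypothetical placeholders.
    #
    #     ts_client.get_ts_datasource_from_table(
    #         table_name="sensor_readings",
    #         key="sensor_id",
    #         time_stamp="ts_ms",
    #         observation="temperature",
    #         cos_out="cos://us-south/my-bucket/ts-source/",
    #         granularity="per_min",
    #         num_objects=20,
    #         dry_run=True,
    #     )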
def _get_ts_datasource_v2( self, table_name, key, time_stamp, observation, cos_out, granularity="raw", where_clause="", ops="avg", dry_run=False, keep_col_names: bool = True, cast_observation=None, num_objects=None, num_rows=None, ): """ Prepare the data source for time-series in the next-query It will returns the data source in 3 columns: field_name, time_stamp, observation Parameters -------------- table: str The catalog table name or the view-table that you generate from the WITH clause via :meth:`with_` method key: str The column name being used as the key, and is maped to `field_name` time_stamp: str The column name being used as timetick, and is mapped to `time_stamp` observation: str The column name being used as value, and is maped to `observation` cos_out: str The COS URL where the data is copied to - later as data source granularity: str There are 2 options: * a value in one of ["raw", "per_min", "per_<x>min", "per_sec", "per_<x>sec", "per_hour", "per_<x>hour"] with <x> is a number divided by 60, e.g. 10, 15 * a value of "per_day" * a value that follows ISO 8601 'duration', e.g. PT1M, PT1S, PT2H ops: str The aggregation method: "avg", "sum", "max", "min", "count" dry_run: bool, optional This option, once selected as True, returns the internally generated SQL statement, and no job is queried. num_objects: None or int The number of objects to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. num_rows: None or int The number of rows for each object to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. keep_col_names: bool, optional (False) By default, all 3 original column names are maintained. If you set to false, they are mapped to `field_name` (for key), `time_stamp` and `observation`, respectively. 
cast_observation: str, optional=None The type to be casted for the `observation` column Returns ---------- str The COS_URL where the data with 3 fields (key, time_stamp, observation) and can be digested into time-series via TIME_SERIES_FORMAT(key, timestick, value) Raises ------ ISO8601Error granularity input is not a valid ISO 8601-compliant value """ """ WITH tmp_A AS (SELECT storage_account_id as field_name, cast(timestamp_finish/1000 as timestamp) as time_stamp, request_latency as observation FROM cos_cx_exp WHERE isnotnull(storage_account_id) AND isnotnull(request_latency) AND Year=2020 and Month=09) Select field_name, to_date(time_stamp) as time_stamp_date, max(observation) as request_latency from tmp_A group by field_name, time_stamp_date INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS """ """ WITH table_A AS (SELECT storage_account_id AS field_name, cast(timestamp_finish/1000 AS TIMESTAMP) AS time_stamp, request_latency AS observation FROM cos_cx_exp WHERE isnotnull(storage_account_id) AND isnotnull(request_latency) AND YEAR=2020 AND MONTH=09 AND DAY<5 INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS) SELECT field_name, to_date(time_stamp) AS time_stamp_date, hour(time_stamp) AS time_stamp_hour, minute(time_stamp) AS time_stamp_minute, (floor(second(time_stamp)/1)) * 1 AS time_stamp_second, max(observation) AS request_latency FROM table_A stored AS parquet GROUP BY field_name, to_date(time_stamp), hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/1)) * 1 INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS SELECT field_name, to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") AS time_stamp, request_latency AS observation FROM table_A stored AS parquet INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS """ tmp_cos = self.target_cos_url if num_objects is None and num_rows is None: print("provide at least `num_objects` or `num_rows`") assert 0 if num_objects is not None and num_rows is not None: print("can't use both `num_objects` and `num_rows`") assert 0 if granularity == "raw": pass elif granularity == "per_min": pass if time_stamp in self._unixtime_columns: # -- convert unix-time in ms to sec time_info = "cast({time_stamp}/1000 as timestamp)".format( time_stamp=time_stamp ) else: time_info = "{time_stamp}".format(time_stamp=time_stamp) if keep_col_names is True: key_col = "field_name as {key}".format(key=key) obs_col = "{observation}".format(observation=observation) else: key_col = "field_name, -- as {key}".format(key=key) obs_col = "{observation} as observation".format(observation=observation) if granularity == "raw": tmp_cos = cos_out if len(where_clause) == 0: extra_where = "" else: extra_where = "AND " + where_clause if num_objects: partition_clause = "PARTITIONED INTO {x} OBJECTS".format(x=num_objects) elif num_rows: partition_clause = "PARTITIONED EVERY {x} ROWS".format(x=num_rows) if cast_observation is None: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, {observation} as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} """ else: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, cast({observation} as {cast_observation}) as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} """ view_stmt = "" if self._has_with_clause and not 
self._has_select_clause: # don't support this path assert 0 # self._sql_stmt = self._sql_stmt + select_stmt else: # self._sql_stmt = select_stmt view_stmt = select_stmt result = None # if dry_run: # print_sql(self._sql_stmt) # select_container_id_full_location = "cos://dry_run/" # else: # result = self.execute_sql(self._sql_stmt) # select_container_id_full_location = self.get_job(result.job_id)['resultset_location'] # cos_in = "" if granularity == "raw": select_stmt += """ INTO {cos_out} STORED AS PARQUET {partition_clause} """ select_stmt = select_stmt.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, cast_observation=cast_observation, partition_clause=partition_clause, ) if dry_run: # print_sql(self._sql_stmt) print_sql(select_stmt) else: # result = self.execute_sql(self._sql_stmt) result = self.execute_sql(select_stmt) return result else: select_stmt = select_stmt.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, cast_observation=cast_observation, partition_clause=partition_clause, ) # cos_in = select_container_id_full_location # self._sql_stmt = "WITH table_A AS (" + select_stmt + ")" view_stmt = "WITH table_A AS (" + select_stmt + ")" cos_in = "table_A" def query_day(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*day") p2 = re.compile(r"per_[0-9]+day") level = "raw" num_day = 1 if p.match(granularity): level = "day" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_day = list(map(int, temp))[0] assert 1 % num_day == 0 else: if granularity[0:1].upper() == "P" and granularity[-1].upper() == "D": level = "day" try: import isodate x = isodate.parse_duration(granularity.strip()) num_day = int(x.total_seconds() / 60 / 60 / 24) # (day) assert 1 % num_day == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "day": if num_day == 1: sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, partition_clause=partition_clause, ops=ops, ) else: # num_hour > 1 msg = "Can't use a day value > 1" raise Exception(msg) assert 0 sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", "00:00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", "00:00"), "yyyy-MM-dd 
HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, datetime_col=datetime_col, obs_col=obs_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_hour(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*hour") p2 = re.compile(r"per_[0-9]+hour") level = "raw" num_hour = 1 if p.match(granularity): level = "hour" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_hour = list(map(int, temp))[0] assert 24 % num_hour == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "H": level = "hour" try: import isodate x = isodate.parse_duration(granularity.strip()) num_hour = int(x.total_seconds() / 60 / 60) # (hour) assert 24 % num_hour == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "hour": if num_hour == 1: sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) else: # num_hour > 1 sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":00"), "yyyy-MM-dd HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_min(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*min") p2 = re.compile(r"per_[0-9]+min") level = "raw" num_min = 1 if p.match(granularity): level = "minute" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_min = list(map(int, temp))[0] assert 60 % num_min == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "M": level = "minute" 
try: import isodate x = isodate.parse_duration(granularity.strip()) num_min = int(x.total_seconds() / 60) # (minute) assert 60 % num_min == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "minute": sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, (floor(minute(time_stamp)/{num_min})) * {num_min} as time_stamp_minute, -- within [current, current+{num_min}) minute time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, to_date(time_stamp), hour(time_stamp), (floor(minute(time_stamp)/{num_min})) * {num_min} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_min=num_min, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: cos_in = "cos://dry_run/" else: cos_in = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":", time_stamp_minute), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", time_stamp_hour, ":", time_stamp_minute), "yyyy-MM-dd HH:mm") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result def query_sec(): """ return: None or 'result' """ p = re.compile(r"per_[0-9]*sec") p2 = re.compile(r"per_[0-9]+sec") level = "raw" num_sec = 1 if p.match(granularity): level = "sec" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_sec = list(map(int, temp))[0] assert 60 % num_sec == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "S": level = "sec" try: import isodate x = isodate.parse_duration(granularity.strip()) num_sec = int(x.total_seconds()) assert 60 % num_sec == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "sec": sql_stmt = """ select field_name, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, minute(time_stamp) as time_stamp_minute, (floor(second(time_stamp)/{num_sec})) * {num_sec} as time_stamp_second, -- within [current, current+{num_sec}) second time-window {ops}(observation) as {observation} from {cos_in} stored as parquet group by field_name, to_date(time_stamp), hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/{num_sec})) * {num_sec} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, num_sec=num_sec, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # Restore timestamp as single column if dry_run: init_cos = "cos://dry_run/" else: init_cos = self.get_job(result.job_id)["resultset_location"] if keep_col_names is True: datetime_col = 
"""to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") as time_stamp""" sql_stmt = """ select {key_col}, {datetime_col}, {obs_col} from {cos_in} stored as parquet INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, partition_clause=partition_clause, ) if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) return result result = query_min() if result is None: result = query_sec() if result is None: result = query_hour() if result is None: result = query_day() return result def _get_ts_datasource_v3( self, table_name, cos_in, key, time_stamp, observation, cos_out, granularity="raw", where_clause="", ops="avg", dry_run=False, keep_col_names: bool = True, cast_observation=None, num_objects=None, num_rows=None, ): """ Prepare the data source for time-series in the next-query It will returns the data source in 3 columns: field_name, time_stamp, observation Parameters -------------- table_name: str The catalog table name or the view-table that you generate from the WITH clause via :meth:`with_` method cos_in: str The COS URL and format of datasource, in case you don't use `table_name` key: str The column name being used as the key, and is maped to `field_name` time_stamp: str The column name being used as timetick, and is mapped to `time_stamp` observation: str The column name being used as value, and is maped to `observation` cast_observation: str, optional=None The type to be casted for the `observation` column cos_out: str The COS URL where the data is copied to - later as data source granularity: str There are 2 options: * a value in one of ["raw", "per_min", "per_<x>min", "per_sec", "per_<x>sec", "per_hour", "per_<x>hour"] with <x> is a number divided by 60, e.g. 10, 15 * a value of "per_day" * a value that follows ISO 8601 'duration', e.g. PT1M, PT1S, PT2H ops: str The aggregation method: "avg", "sum", "max", "min", "count" dry_run: bool, optional This option, once selected as True, returns the internally generated SQL statement, and no job is queried. num_objects: None or int The number of objects to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. num_rows: None or int The number of rows for each object to be created for storing the data. Using `num_objects` and `num_rows` are exclusive. keep_col_names: bool, optional (False) By default, all 3 original column names are maintained. If you set to false, they are mapped to `field_name` (for key), `time_stamp` and `observation`, respectively. 
Returns ---------- str The COS_URL where the data with 3 fields (key, time_stamp, observation) and can be digested into time-series via TIME_SERIES_FORMAT(key, timestick, value) Raises ------ ISO8601Error granularity input is not a valid ISO 8601-compliant value """ """ WITH table_A AS (SELECT storage_account_id AS field_name, cast(timestamp_finish/1000 AS TIMESTAMP) AS time_stamp, request_latency AS observation FROM cos_cx_exp WHERE isnotnull(storage_account_id) AND isnotnull(request_latency) AND YEAR=2020 AND MONTH=09 AND DAY<5 INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS) SELECT field_name, to_date(time_stamp) AS time_stamp_date, hour(time_stamp) AS time_stamp_hour, minute(time_stamp) AS time_stamp_minute, (floor(second(time_stamp)/1)) * 1 AS time_stamp_second, max(observation) AS request_latency to_timestamp(concat(date(time_stamp_date), " ", ":".join([time_stamp_hour, time_stamp_minute, time_stamp_second]), "yyyy-MM-dd HH:mm:ss") AS time_stamp, FROM table_A stored AS parquet GROUP BY field_name, to_date(time_stamp), hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/1)) * 1 INTO cos://us-south/customer-exp STORED AS PARQUET PARTITIONED INTO 20 OBJECTS HISTORY: Aug-05-2021: expand to add `cos_in` argument """ assert not (table_name is None and cos_in is None) assert not (table_name is not None and cos_in is not None) if cos_in and len(cos_in) > 0: table_name = cos_in tmp_cos = self.target_cos_url if num_objects is None and num_rows is None: print("provide at least `num_objects` or `num_rows`") assert 0 if num_objects is not None and num_rows is not None: print("can't use both `num_objects` and `num_rows`") assert 0 if granularity == "raw": pass elif granularity == "per_min": pass if time_stamp in self._unixtime_columns: # -- convert unix-time in ms to sec time_info = "cast({time_stamp}/1000 as timestamp)".format( time_stamp=time_stamp ) else: time_info = "{time_stamp}".format(time_stamp=time_stamp) if keep_col_names is True: key_col = "field_name as {key}".format(key=key) obs_col = "{ops}(observation) as {observation}".format( observation=observation, ops=ops ) else: key_col = "field_name, -- as {key}".format(key=key) obs_col = "{ops}(observation) as observation".format( observation=observation, ops=ops ) if granularity == "raw": tmp_cos = cos_out if len(where_clause) == 0: extra_where = "" else: extra_where = "AND " + where_clause if num_objects: partition_clause = "PARTITIONED INTO {x} OBJECTS".format(x=num_objects) elif num_rows: partition_clause = "PARTITIONED EVERY {x} ROWS".format(x=num_rows) if cast_observation is None: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, {observation} as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} """ else: select_stmt = """SELECT {key} as field_name, {time_info} as time_stamp, cast({observation} as {cast_observation}) as observation FROM {table_name} WHERE isnotnull({key}) AND isnotnull({observation}) {extra_where} """ view_stmt = "" if self._has_with_clause and self._has_select_clause: # don't support this path logger.error("Please reset with `.reset_()`") assert 0 # self._sql_stmt = self._sql_stmt + select_stmt elif self._has_with_clause: # self._sql_stmt = select_stmt # view_stmt = select_stmt view_stmt = self._sql_stmt result = None # if dry_run: # print_sql(self._sql_stmt) # select_container_id_full_location = "cos://dry_run/" # else: # result = self.execute_sql(self._sql_stmt) # select_container_id_full_location = 
self.get_job(result.job_id)['resultset_location'] # cos_in = "" if granularity == "raw": select_stmt += """ INTO {cos_out} STORED AS PARQUET {partition_clause} """ select_stmt = view_stmt + select_stmt select_stmt = select_stmt.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, cast_observation=cast_observation, partition_clause=partition_clause, ) if dry_run: # print_sql(self._sql_stmt) print_sql(select_stmt) else: # result = self.execute_sql(self._sql_stmt) result = self.execute_sql(select_stmt) return result else: select_stmt = select_stmt.format( table_name=table_name, cos_out=tmp_cos, key=key, observation=observation, time_info=time_info, extra_where=extra_where, cast_observation=cast_observation, partition_clause=partition_clause, ) # cos_in = select_container_id_full_location # self._sql_stmt = "WITH table_A AS (" + select_stmt + ")" if len(view_stmt) > 0: view_stmt = view_stmt + ", {tmp_table} AS ({sql})".format( tmp_table="table_A", sql=select_stmt ) # cos_in = self._current_vtable_name cos_in = "table_A" # view_stmt = "WITH table_A AS (" + select_stmt + ")" else: view_stmt = "WITH table_A AS (" + select_stmt + ")" cos_in = "table_A" def query_day(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*day") p2 = re.compile(r"per_[0-9]+day") level = "raw" num_day = 1 if p.match(granularity): level = "day" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_day = list(map(int, temp))[0] assert 1 % num_day == 0 else: if granularity[0:1].upper() == "P" and granularity[-1].upper() == "D": level = "day" try: import isodate x = isodate.parse_duration(granularity.strip()) num_day = int(x.total_seconds() / 60 / 60 / 24) # (day) assert 1 % num_day == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "day": if keep_col_names is True: datetime_col = """to_timestamp(concat(date(time_stamp), " ", "00:00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(time_stamp), " ", "00:00"), "yyyy-MM-dd HH:mm") as time_stamp""" if num_day == 1: sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, {datetime_col} from {cos_in} group by field_name, to_date(time_stamp) INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_day=num_day, partition_clause=partition_clause, ops=ops, ) else: # num_day > 1 msg = "Can't use a day value > 1" raise Exception(msg) assert 0 sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {datetime_col} from {cos_in} group by field_name, time_stamp_date, time_stamp_hour INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_day=num_day, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) ## Restore timestamp as single column # if dry_run: # cos_in = "cos://dry_run/" # else: # cos_in = self.get_job(result.job_id)['resultset_location'] # sql_stmt = """ # select # {key_col}, # {datetime_col}, # 
{obs_col} # from {cos_in} stored as parquet # INTO {cos_out} STORED AS PARQUET {partition_clause} # """.format(cos_in=cos_in, cos_out=cos_out, key_col=key_col, datetime_col=datetime_col, obs_col=obs_col, time_info=time_info, # partition_clause=partition_clause) # if dry_run: # print_sql(sql_stmt) # result = None # else: # result = self.execute_sql(sql_stmt) return result def query_hour(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*hour") p2 = re.compile(r"per_[0-9]+hour") level = "raw" num_hour = 1 if p.match(granularity): level = "hour" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_hour = list(map(int, temp))[0] assert 24 % num_hour == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "H": level = "hour" try: import isodate x = isodate.parse_duration(granularity.strip()) num_hour = int(x.total_seconds() / 60 / 60) # (hour) assert 24 % num_hour == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "hour": if keep_col_names is True: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", hour(time_stamp), ":00"), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp ) else: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", hour(time_stamp), ":00"), "yyyy-MM-dd HH:mm") as time_stamp""" if num_hour == 1: sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, {datetime_col} from {cos_in} group by field_name, to_date(time_stamp), hour(time_stamp) INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) else: # num_hour > 1 sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, (floor(hour(time_stamp)/{num_hour})) * {num_hour} as time_stamp_hour, -- within [current, current+{num_hour}) hour time-window {datetime_col} from {cos_in} group by field_name, to_date(time_stamp), (floor(hour(time_stamp)/{num_hour})) * {num_hour} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_hour=num_hour, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) ## Restore timestamp as single column # if dry_run: # cos_in = "cos://dry_run/" # else: # cos_in = self.get_job(result.job_id)['resultset_location'] # sql_stmt = """ # select # {key_col}, # {datetime_col}, # {obs_col} # from {cos_in} stored as parquet # INTO {cos_out} STORED AS PARQUET {partition_clause} # """.format(cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, # partition_clause=partition_clause) # if dry_run: # print_sql(sql_stmt) # result = None # else: # result = self.execute_sql(sql_stmt) return result def query_min(): """ return: None or 'result' """ nonlocal cos_in p = re.compile(r"per_[0-9]*min") p2 = re.compile(r"per_[0-9]+min") level = "raw" num_min = 1 if p.match(granularity): level = "minute" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_min = list(map(int, temp))[0] assert 60 % num_min == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == 
"M": level = "minute" try: import isodate x = isodate.parse_duration(granularity.strip()) num_min = int(x.total_seconds() / 60) # (minute) assert 60 % num_min == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "minute": if keep_col_names is True: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", hour(time_stamp), ":", (floor(minute(time_stamp)/{num_min})) * {num_min}), "yyyy-MM-dd HH:mm") as {time_stamp}""".format( time_stamp=time_stamp, num_min=num_min ) else: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", hour(time_stamp), ":", (floor(minute(time_stamp)/{num_min})) * {num_min}), "yyyy-MM-dd HH:mm") as time_stamp""".format( num_min=num_min ) sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, (floor(minute(time_stamp)/{num_min})) * {num_min} as time_stamp_minute, -- within [current, current+{num_min}) minute time-window {datetime_col} from {cos_in} group by field_name, to_date(time_stamp), hour(time_stamp), (floor(minute(time_stamp)/{num_min})) * {num_min} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_min=num_min, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) ## Restore timestamp as single column # if dry_run: # cos_in = "cos://dry_run/" # else: # cos_in = self.get_job(result.job_id)['resultset_location'] # sql_stmt = """ # select # {key_col}, # {datetime_col}, # {obs_col} # from {cos_in} stored as parquet # INTO {cos_out} STORED AS PARQUET {partition_clause} # """.format(cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, # partition_clause=partition_clause) # if dry_run: # print_sql(sql_stmt) # result = None # else: # result = self.execute_sql(sql_stmt) return result def query_sec(): """ return: None or 'result' """ p = re.compile(r"per_[0-9]*sec") p2 = re.compile(r"per_[0-9]+sec") level = "raw" num_sec = 1 if p.match(granularity): level = "sec" if p2.match(granularity): temp = re.findall(r"\d+", granularity) num_sec = list(map(int, temp))[0] assert 60 % num_sec == 0 else: if granularity[0:2].upper() == "PT" and granularity[-1].upper() == "S": level = "sec" try: import isodate x = isodate.parse_duration(granularity.strip()) num_sec = int(x.total_seconds()) assert 60 % num_sec == 0 except ISO8601Error: print("%s unsupported" % granularity) assert 0 else: return None if level == "sec": if keep_col_names is True: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", ":".join([hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/{num_sec})) * {num_sec}]), "yyyy-MM-dd HH:mm:ss") as {time_stamp}""".format( time_stamp=time_stamp, num_sec=num_sec ) else: datetime_col = """to_timestamp(concat(date(to_date(time_stamp)), " ", ":".join([hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/{num_sec})) * {num_sec}]), "yyyy-MM-dd HH:mm:ss") as time_stamp""".format( num_sec=num_sec ) sql_stmt = """ select {key_col}, {obs_col}, to_date(time_stamp) as time_stamp_date, hour(time_stamp) as time_stamp_hour, minute(time_stamp) as time_stamp_minute, (floor(second(time_stamp)/{num_sec})) * {num_sec} as time_stamp_second, -- within [current, current+{num_sec}) second time-window, {datetime_col} 
from {cos_in} group by field_name, to_date(time_stamp), hour(time_stamp), minute(time_stamp), (floor(second(time_stamp)/{num_sec})) * {num_sec} INTO {cos_out} STORED AS PARQUET {partition_clause} """.format( cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, datetime_col=datetime_col, time_info=time_info, num_sec=num_sec, partition_clause=partition_clause, ops=ops, ) sql_stmt = view_stmt + "\n" + sql_stmt if dry_run: print_sql(sql_stmt) result = None else: result = self.execute_sql(sql_stmt) # # Restore timestamp as single column # if dry_run: # init_cos = "cos://dry_run/" # else: # init_cos = self.get_job(result.job_id)['resultset_location'] # sql_stmt = """ # select # {key_col}, # {datetime_col}, # {obs_col} # from {cos_in} stored as parquet # INTO {cos_out} STORED AS PARQUET {partition_clause} # """.format(cos_in=cos_in, cos_out=cos_out, key_col=key_col, obs_col=obs_col, # datetime_col=datetime_col, time_info=time_info, # partition_clause=partition_clause) # if dry_run: # print_sql(sql_stmt) # result = None # else: # result = self.execute_sql(sql_stmt) return result result = query_min() if result is None: result = query_sec() if result is None: result = query_hour() if result is None: result = query_day() return result
[docs]    def human_form_to_machine_form(self, sql_stmt):
        """
        Convert human-readable time-unit names in the SQL statement, e.g. hour,
        day, ..., into the numeric millisecond values expected by the
        time-series functions in SQL.
        """
        with lock:
            backup_stmt = self._sql_stmt
            self._sql_stmt = sql_stmt
            self.format_()
            sql_stmt = self._sql_stmt
            self._sql_stmt = backup_stmt
        return sql_stmt
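    # Example (illustration only): with `format_()` rewriting time-unit
    # keywords into milliseconds, a fragment such as
    #
    #     TS_SEG_AVG(TS_SEGMENT_BY_TIME(pm, hour, hour))
    #
    # is converted into
    #
    #     TS_SEG_AVG(TS_SEGMENT_BY_TIME(pm, 3600000, 3600000))
    #
    # before being submitted, as exercised by `test_run()` at the end of this
    # module.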
[docs]    def execute_sql(self, *args, **kwargs):
        sql_stmt = args[0]
        tmp = list(args)
        sql_stmt = self.human_form_to_machine_form(sql_stmt)
        tmp[0] = sql_stmt
        args = tuple(tmp)
        return super().execute_sql(*args, **kwargs)
[docs]    def run_sql(self, *args, **kwargs):
        sql_stmt = args[0]
        tmp = list(args)
        sql_stmt = self.human_form_to_machine_form(sql_stmt)
        tmp[0] = sql_stmt
        args = tuple(tmp)
        return super().run_sql(*args, **kwargs)
[docs]    def submit_sql(self, *args, **kwargs):
        sql_stmt = args[0]
        tmp = list(args)
        sql_stmt = self.human_form_to_machine_form(sql_stmt)
        tmp[0] = sql_stmt
        args = tuple(tmp)
        return super().submit_sql(*args, **kwargs)
[docs]def test_run():
    sqlClient = SQLClientTimeSeries()
    sql_stmt = """
    WITH hourly_avg_table as (
        SELECT location,
            TS_SEG_AVG(TS_SEGMENT_BY_TIME(pm, 3600000, 3600000)) as hourly_avg_pm_ts,
            TS_SEG_AVG(TS_SEGMENT_BY_TIME(pm, hour, hour)) as hourly_avg_pm_ts_2,
            TS_SEG_AVG(TS_SEGMENT_BY_TIME(pm, hour, 3600000)) as hourly_avg_pm_ts_3
        FROM cos://s3.us-south.cloud-object-storage.appdomain.cloud/tuan-sql-result/jobid=f6cee0f2-5524-43cd-8aee-ddc3c428868f STORED AS PARQUET)
    SELECT location,
        TS_FILLNA(hourly_avg_pm_ts, TS_INTERPOLATOR_LINEAR(0.0, 1, 1)) as hourly_avg_pm_ts
    FROM hourly_avg_table
    INTO cos://us-south/tuan-sql-result STORED AS PARQUET
    """
    sqlClient._sql_stmt = sql_stmt
    sqlClient.run()
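# Optional entry point (a convenience sketch): run the smoke test above when
# this file is executed directly.
if __name__ == "__main__":
    test_run()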