Using ibis, impyla, pyhive and pyspark to connect to Kerberos-secured Hive and Impala from Python

Keywords: hive SQL Spark Database

There are many ways to connect to Hive and Impala from Python, including pyhive, impyla, pyspark and ibis. This article introduces how to use each of these packages to connect to Hive or Impala, and how to pass Kerberos authentication.

Kerberos

If the cluster does not have Kerberos authentication enabled, or if you have already authenticated with the kinit command in the system environment, the code in this section is not needed.
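
If you want to check from Python whether the current environment already holds a valid ticket, here is a minimal sketch (assuming MIT Kerberos's klist command is on the PATH; this helper is not part of the original article):

import subprocess

def has_valid_ticket():
    # klist -s exits with status 0 when the credential cache holds a non-expired ticket
    return subprocess.call(['klist', '-s']) == 0

if has_valid_ticket():
    print('Already authenticated; the krbcontext code below is not needed.')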

krbcontext.context_shell

# -*- coding: utf-8 -*-

__all__ = [ 'krbcontext', 'KRB5KinitError', ]

import os, sys
# import pwd
import subprocess


from contextlib import contextmanager

class KRB5KinitError(Exception):
    pass

# def get_login():
#     ''' Get current effective user name '''
#
#     return pwd.getpwuid(os.getuid()).pw_name

def init_ccache_as_regular_user(principal=None, ccache_file=None):
    '''Initialize credential cache as a regular user

    Return the filename of newly initialized credential cache
    '''

    if not sys.stdin.isatty():
        raise IOError('This is not running on console. So, you need to run kinit '
                      'with your principal manually before anything goes.')

    cmd = 'kinit %(ccache_file)s %(principal)s'
    args = {}

    args['principal'] = principal
    args['ccache_file'] = '-c %s' % ccache_file

    kinit_proc = subprocess.Popen(
        (cmd % args).split(),
        stderr=subprocess.PIPE)
    stdout_data, stderr_data = kinit_proc.communicate()

    if kinit_proc.returncode > 0:
        raise KRB5KinitError(stderr_data)

    return ccache_file



def init_ccache_with_keytab(principal, keytab_file, ccache_file):
    '''Initialize credential cache using keytab file

    Return the filename of newly initialized credential cache
    '''
    cmd = 'kinit -kt %(keytab_file)s -c %(ccache_file)s %(principal)s'
    args = {}

    args['principal'] = principal
    args['ccache_file'] = ccache_file
    args['keytab_file'] = keytab_file

    kinit_proc = subprocess.Popen(
        (cmd % args).split(),
        stderr=subprocess.PIPE)
    stdout_data, stderr_data = kinit_proc.communicate()

    if kinit_proc.returncode > 0:
        raise KRB5KinitError(stderr_data)

    return ccache_file


@contextmanager
def krbcontext(using_keytab=False, **kwargs):
    '''A context manager for Kerberos-related actions

    using_keytab: specify to use Keytab file in Kerberos context if True,
                  or be as a regular user.
    kwargs: contains the necessary arguments used in kerberos context.
            It can contain principal, keytab_file, ccache_file.
            When you want to use Keytab file, keytab_file must be included.
    '''

    env_name = 'KRB5CCNAME'
    old_ccache = os.getenv(env_name)
    if using_keytab:
        ccache_file = init_ccache_with_keytab(**kwargs)
    else:
        ccache_file = init_ccache_as_regular_user(kwargs.get("principal"), kwargs.get("ccache_file"))
    os.environ[env_name] = ccache_file
    try:
        yield
    finally:
        # Restore the original credential cache environment when leaving the context
        if old_ccache is not None:
            os.environ[env_name] = old_ccache
        else:
            os.environ.pop(env_name, None)

pyhive

Connect hive with pyhive

Environment

 """
 decorator==4.4.2
 future==0.18.2
 gssapi==1.6.5
 krbcontext==0.10
 PyHive==0.5.0
 impyla==0.14.1
 sasl==0.2.1
 six==1.11.0
 thrift_sasl==0.3.0  # ibis and impyla require thrift_sasl==0.2.1 instead
 thrift==0.13.0
 thriftpy==0.3.5
 """

from pyhive import sqlalchemy_hive,hive
from krbcontext.context_shell import krbcontext

Kerberos authentication

There are two ways. One is to keep Kerberos authentication alive in the system environment with kinit for the current user; then no Kerberos code is needed in any of the scripts. The other is to enter a Kerberos-authenticated session inside the Python script with the following code:

config = {
    "kerberos_principal": "hive@CSDNTEST.COM.LOCAL",
    "keytab_file": '/home/tools/wyk/keytab/hive.keytab',
    "kerberos_ccache_file": '/home/tools/wyk/keytab/hive_ccache_uid',
    "AUTH_MECHANISM": "GSSAPI"
}
with krbcontext(using_keytab=True,
                principal=config['kerberos_principal'],
                keytab_file=config['keytab_file'],
                ccache_file=config['kerberos_ccache_file']):
    # Everything executed inside this block runs with Kerberos authentication.
    # You can query HDFS, Hive, HBase, etc. here, as long as the Kerberos
    # principal has the required permissions.
    pass  # replace with your Hive/Impala code (see the examples below)

Usage

The following code must run inside the Kerberos block above in order to pass authentication. If the cluster has not enabled Kerberos, or the current system session has already authenticated with kinit, the block above is not required:

con = hive.connect(host='uatnd02.csdntest.com.local', port=10000, auth='KERBEROS',
                   kerberos_service_name="hive")  # host is a HiveServer2 node; 10000 is the default HS2 port
cursor = con.cursor()
cursor.execute('select * from dl_nccp.account limit 5')  # no trailing semicolon!
# cursor.execute('desc dl_nccp.account')  # no trailing semicolon!
datas = cursor.fetchall()
print(datas)
cursor.close()
con.close()
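
The sqlalchemy_hive import above also registers a hive:// dialect with SQLAlchemy. As a minimal sketch (assuming the same HiveServer2 host and Kerberos settings as above, and that pandas is installed; not part of the original article), a query can be read straight into a pandas DataFrame:

import pandas as pd
from sqlalchemy import create_engine

# The hive:// dialect is provided by PyHive; connect_args are passed through to hive.connect
engine = create_engine(
    'hive://uatnd02.csdntest.com.local:10000/dl_nccp',
    connect_args={'auth': 'KERBEROS', 'kerberos_service_name': 'hive'})
df = pd.read_sql('select * from dl_nccp.account limit 5', engine)
print(df.head())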

 

impyla

Environment

 """
 decorator==4.4.2
 future==0.18.2
 gssapi==1.6.5
 krbcontext==0.10
 PyHive==0.5.0
 impyla==0.14.1
 sasl==0.2.1
 six==1.11.0
 thrift_sasl==0.2.1  # Only this one is different from pyhive's
 thrift==0.13.0
 thriftpy==0.3.5
 """

from impala.dbapi import connect
from krbcontext.context_shell import krbcontext

Kerberos authentication

Same as for pyhive above, omitted here.

Usage

# impyla's thrift_sasl==0.2.1 requirement matches ibis's dependency, so the two packages can be used together
conn = connect(host='uatnd02.csdntest.com.local', port=10000, auth_mechanism='GSSAPI',
               kerberos_service_name='hive')
cur = conn.cursor()
cur.execute('SHOW databases')  # no trailing semicolon
cur.table_exists(table_name='account', database_name='dl_nccp')  # returns True or False
cur.ping()  # returns True or False
cur.status()  # returns the status of the last operation
cur.get_table_schema(table_name='account', database_name='dl_nccp')  # returns the table schema, similar to DESC
print(cur.fetchall())
cur.close()
conn.close()
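
impyla also ships a small helper, impala.util.as_pandas, which turns a cursor's result set into a pandas DataFrame. A minimal sketch, assuming the connection above is still open (before conn.close()) and that pandas is installed:

from impala.util import as_pandas

cur = conn.cursor()
cur.execute('select * from dl_nccp.account limit 5')  # no trailing semicolon
df = as_pandas(cur)  # fetches the remaining rows into a pandas DataFrame
print(df.head())
cur.close()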

ibis

ibis is a very powerful third-party package that supports access to many kinds of databases and file systems. It is feature-rich and returns results as pandas DataFrames, which is very friendly for data developers. I have used this package for more than two years and highly recommend it.

Official documentation: https://docs.ibis-project.org/getting-started.html

Environment

 """
 ibis-framework==0.14.0  #pip install ibis-framework[impala]==0.14.0
 decorator==4.4.2
 future==0.18.2
 gssapi==1.6.5
 krbcontext==0.10
 PyHive==0.5.0
 impyla==0.14.1
 sasl==0.2.1
 six==1.11.0
 thrift_sasl==0.2.1  # Only this one is different from pyhive's
 thrift==0.13.0
 thriftpy==0.3.5
 """

Kerberos authentication

Same as for pyhive above, omitted here.

Usage

Many more functions are available in the official documentation and source code; only some commonly used ones are shown here. If you use the ibis package, it is worth reading the source code or the official documentation.

import ibis
import pandas as pd
import ibis.expr.datatypes as dt
conf = {
    "impala_host": "uathd01.csdntest.com.local",
    "impala_port": 21050,
    "kerberos_service_name": "impala",
    "auth_mechanism": "GSSAPI",
    "webhdfs_host1": "uatnd01.csdntest.com.local",
    "webhdfs_host2": "uatnd02.csdntest.com.local",
    "webhdfs_port": 9870  # for Hadoop versions before 3.x, use 50070 instead
}
# Get an HDFS connection (fall back to the other NameNode if the first attempt fails)
try:
    hdfs_client = ibis.hdfs_connect(host=conf["webhdfs_host2"], port=conf["webhdfs_port"],
                                    auth_mechanism=conf["auth_mechanism"], use_https=False, verify=False)
    hdfs_client.ls('/')
except Exception:
    hdfs_client = ibis.hdfs_connect(host=conf["webhdfs_host1"], port=conf["webhdfs_port"],
                                    auth_mechanism=conf["auth_mechanism"], use_https=False, verify=False)

# Get an Impala connection
impala_client = ibis.impala.connect(host=conf["impala_host"], port=conf["impala_port"],
                                    hdfs_client=hdfs_client, auth_mechanism=conf["auth_mechanism"],
                                    timeout=300)

# select * from dh_sales.r_order limit 10; returns a pandas DataFrame
res = impala_client.table('r_order', database='dh_sales').execute(limit=10)
print(type(res))
print(res.dtypes)
print(res)

#Query impala with SQL
res = impala_client.sql("""select area,is_oversea,dw_ins_ts,sum(gid_cnt) from dh_t3_report_crm.r_duration_area_metr where is_oversea=0 group by 1,2,3 limit 5""")
df_res = res.execute()
print(df_res)
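
# A minimal sketch (an assumption, not from the original article): the same aggregation
# written with ibis's expression API instead of raw SQL; it also returns a pandas DataFrame
t = impala_client.table('r_duration_area_metr', database='dh_t3_report_crm')
expr = (t.filter(t.is_oversea == 0)
         .group_by([t.area, t.is_oversea, t.dw_ins_ts])
         .aggregate([t.gid_cnt.sum().name('gid_cnt_sum')])
         .limit(5))
print(expr.execute())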

# List all tables in a database
impala_client.list_tables(database='dl_common')

# Create a table from a pandas DataFrame; column types are mapped automatically
impala_client.create_table(table_name='ibis_create0602', obj=sys_dict, database='default', force=True)
...
# pd_res_acc omitted
# pd_res_cre omitted
...
# Insert data from a pandas DataFrame
impala_client.insert(table_name='tmp_account', obj=pd_res_acc, database='default', overwrite=True)
impala_client.insert(table_name='ibis_create', obj=pd_res_cre, database='default', overwrite=True)
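
For a self-contained illustration of create_table, here is a hypothetical sketch (demo_df and the table name ibis_demo are made up for this example, not from the original article) using a small inline pandas DataFrame:

demo_df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})  # hypothetical demo data
impala_client.create_table(table_name='ibis_demo', obj=demo_df, database='default', force=True)
print(impala_client.table('ibis_demo', database='default').execute())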

 

Bug workaround

The following error may be raised when executing the insert or create commands. It is a bug at the source level; refer to the link below and modify the source code as described to work around it:

UnicodeEncodeError: 'latin-1' codec can't encode characters in position 160-161: ordinal not in range(256)

Reference: https://github.com/ibis-project/ibis/issues/2120

vim /home/tools/python3/Python-3.6.8/lib/python3.6/site-packages/hdfs/client.py 

 

pyspark

Environment

vim /etc/profile

export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark
export SPARK_CONF_DIR=$SPARK_HOME/conf
export PYTHONPATH=/home/tools/anaconda3/envs/csdn/bin
export PYSPARK_PYTHON=/home/tools/anaconda3/envs/csdn/bin/python3.6
"""
pyspark==2.4.5
"""

Kerberos authentication

Same as for pyhive above, omitted here.

Usage

Use pyspark to connect to Hive, run a query, and convert the resulting Spark DataFrame to a pandas DataFrame:

from __future__ import division
#import findspark as fs
#fs.init()
import pandas as pd
from pyspark.sql import HiveContext,SparkSession,SQLContext
from pyspark import SparkContext, SparkConf
import pyspark.sql.functions as F
import datetime as dt
from datetime import datetime
import random
import numpy as np
# from log3 import log_to_file, log  # project-specific logging helpers, not needed for this example
from pyspark.sql.types import *

conf = SparkConf().setMaster("yarn").setAppName("MF_return_calc")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
hiveCtx = HiveContext(sc)
spark = SparkSession.builder.master("yarn").appName("MF_return_calc").config("spark.debug.maxToStringFields", "100").getOrCreate()

# Execute SQL
test_sql = """select * from dl_nccp.account limit 5"""
res = hiveCtx.sql(test_sql)
type(res)  # a Spark DataFrame
res.head(3)
res_pd = res.toPandas()  # convert the Spark DataFrame to a pandas DataFrame
res_pd
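
Going the other way is also possible. A minimal sketch (assuming the SparkSession above is built with Hive support, e.g. with .enableHiveSupport(), and that the target database is writable; the table name account_backup is a hypothetical example):

sdf = spark.createDataFrame(res_pd)  # pandas DataFrame back to a Spark DataFrame
sdf.write.mode('overwrite').saveAsTable('default.account_backup')  # save as a Hive table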

 

That wraps up the four ways to access Hive and Impala from Python. I hope they help you. Thank you!

I hope this article helps you. Please give it a like to encourage the author. Thank you!

Posted by RunningUtes on Mon, 08 Jun 2020 23:22:07 -0700