private void pi() {
log.info("----- start pi -----");
final String javaHome = System.getenv("java_HOME");
final String hadoopConfDir = System.getenv("hadoop_CONF_DIR");
log.info("javaHome: " javaHome);
log.info("hadoopConfDir: " hadoopConfDir);
log.info("SPARKHome: " sparkHome);
log.info("mode: " deployMode);
log.info("appResource: " sparkJar);
log.info("mainClass: " mainClass);
final String[] args = new String[]{
"--jar", sparkJar, "--class", mainClass, "--arg", "10"};
String appName = "spark-yarn";
System.setProperty("SPARK_YARN_MODE", "true");
SparkConf sparkConf = new SparkConf();
sparkConf.setSparkHome(sparkHome);
sparkConf.setMaster("yarn");
sparkConf.setAppName(appName);
sparkConf.set("spark.submit.deployMode", "cluster");
String jarDir = "hdfs://sh01:9000/user/deployer/spark-jars/*.jar";
log.info("jarDir: " jarDir);
sparkConf.set("spark.yarn.jars", jarDir);
if (enableKerberos) {
log.info("---------------- enable kerberos ------------------");
sparkConf.set("spark.hadoop.hadoop.security.authentication", "kerberos");
sparkConf.set("spark.hadoop.hadoop.security.authorization", "true");
sparkConf.set("spark.hadoop.dfs.namenode.kerberos.principal", "hdfs/_HOST@KPP.COM");
sparkConf.set("spark.hadoop.yarn.resourcemanager.principal", "yarn/_HOST@KPP.COM");
}
ClientArguments clientArguments = new ClientArguments(args);
Client client = new Client(clientArguments, sparkConf);
// client.run();
ApplicationId applicationId = client.submitApplication();
log.info("submit task [{}] and application id [{}] ", appName, applicationId.getId());
YarnAppReport yarnAppReport = client.monitorApplication(applicationId, false, true, 1000);
log.info("task [{}] process result [{}]", appName, yarnAppReport.finalState());
if (yarnAppReport.finalState().equals(FinalApplicationStatus.SUCCEEDED)) {
log.info("spark任务执行成功");
} else {
log.info("spark任务执行失败");
}
log.info("----- finish pi -----");
}
我来为大家讲解一下关于 spark 用 java 和 scala 写有什么区别,跟着小编一起来看一看吧!
spark用java和scala写有什么区别
核心代码
private void pi() {
log.info("----- start pi -----");
final String javaHome = System.getenv("java_HOME");
final String hadoopConfDir = System.getenv("hadoop_CONF_DIR");
log.info("javaHome: " javaHome);
log.info("hadoopConfDir: " hadoopConfDir);
log.info("SPARKHome: " sparkHome);
log.info("mode: " deployMode);
log.info("appResource: " sparkJar);
log.info("mainClass: " mainClass);
final String[] args = new String[]{
"--jar", sparkJar, "--class", mainClass, "--arg", "10"};
String appName = "spark-yarn";
System.setProperty("SPARK_YARN_MODE", "true");
SparkConf sparkConf = new SparkConf();
sparkConf.setSparkHome(sparkHome);
sparkConf.setMaster("yarn");
sparkConf.setAppName(appName);
sparkConf.set("spark.submit.deployMode", "cluster");
String jarDir = "hdfs://sh01:9000/user/deployer/spark-jars/*.jar";
log.info("jarDir: " jarDir);
sparkConf.set("spark.yarn.jars", jarDir);
if (enableKerberos) {
log.info("---------------- enable kerberos ------------------");
sparkConf.set("spark.hadoop.hadoop.security.authentication", "kerberos");
sparkConf.set("spark.hadoop.hadoop.security.authorization", "true");
sparkConf.set("spark.hadoop.dfs.namenode.kerberos.principal", "hdfs/_HOST@KPP.COM");
sparkConf.set("spark.hadoop.yarn.resourcemanager.principal", "yarn/_HOST@KPP.COM");
}
ClientArguments clientArguments = new ClientArguments(args);
Client client = new Client(clientArguments, sparkConf);
// client.run();
ApplicationId applicationId = client.submitApplication();
log.info("submit task [{}] and application id [{}] ", appName, applicationId.getId());
YarnAppReport yarnAppReport = client.monitorApplication(applicationId, false, true, 1000);
log.info("task [{}] process result [{}]", appName, yarnAppReport.finalState());
if (yarnAppReport.finalState().equals(FinalApplicationStatus.SUCCEEDED)) {
log.info("spark任务执行成功");
} else {
log.info("spark任务执行失败");
}
log.info("----- finish pi -----");
}
client.run() 是同步的,spark 任务结束前该行以下的代码不会执行。该方法无返回值,也就是说拿不到 spark 任务执行的任何信息。
client.submitApplication() 是异步的,提交任务后立即执行该行下的代码。但是该方法会返回 ApplicationId ,这个就很有用啦。接下来可以调用 monitorApplication 方法让 java 代码 block 住,并且拿到 spark 任务执行的一些信息。
// Example call: returnOnRunning=false, logApplicationReport=true, interval=1000.
YarnAppReport yarnAppReport = client.monitorApplication(applicationId, false, true, 1000);
/**
 * Signature of the Spark YARN client's {@code monitorApplication}, shown for reference only.
 *
 * @param appId                id of the submitted Spark job
 * @param returnOnRunning      when {@code true}, stop blocking and return once the job is
 *                             RUNNING; when {@code false}, block until the job has finished
 * @param logApplicationReport whether the job's report is logged while it runs
 * @param interval             how often the job is polled (unit not shown here -- TODO confirm)
 * @return a {@code YarnAppReport} holding the job's state and other information
 */
public YarnAppReport monitorApplication(final ApplicationId appId, final boolean returnOnRunning, final boolean logApplicationReport, final long interval) {
// Body omitted here; read the Spark source if you need the details.
}
- applicationId:就不用说啦,肯定是 spark job 的 id。
- returnOnRunning:
  - true:当 spark job 处于 RUNNING 状态时,monitorApplication 方法结束 block,返回 yarnAppReport。
  - false:monitorApplication 等待 spark job 执行完毕后结束 block,返回 yarnAppReport。当然如果 spark job 里面有 bug,那该啥时返回就啥时返回,具体的可以看下源代码,只需要看清楚几个关键环节就行。
- logApplicationReport:控制是否输出 spark job 执行时的日志。
- interval:间隔多长时间去轮询一次 spark job。源代码里面写的是 while(true) 循环。
- YarnAppReport 中持有 spark 任务的状态以及其他信息,具体内容自己可以去里面搜。

很显然,client.submitApplication() 更有操作空间。