Please note that Datameer does not recommend direct access to the application database in production environments on a regular basis. The database schema might change from version to version without notice.
Ensure that you always have a recent backup before executing any query against the production database.
Database Status Queries
#Display the current processes
show full processlist;
#Display the InnoDB status (good for viewing deadlocks)
show engine innodb status;
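If you also want a quick look at connection and thread load, the standard MySQL status counters can help. This is plain MySQL, not Datameer-specific:

#Display connection and thread counters (standard MySQL, not Datameer-specific)
show global status like 'Threads%';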
Job Link Queries
#Import to Workbook link
select df3.id, df3.name ImportName, wc.id WbConfigId, df.name WorkbookName, sht.id SheetId, sht.name SheetName
from workbook_configuration wc
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df on djc.dap_file__id = df.id
inner join sheet sht on wc.id = sht.workbook_fk
inner join data_source_configuration dsc on sht.data_source_id = dsc.id
inner join dap_job_configuration djc3 on dsc.id = djc3.id
inner join dap_file df3 on djc3.dap_file__id = df3.id
order by df3.name;

#Workbook to Workbook link
select sht.external_sheet__id, df2.name ParentWorkbookName, sht2.name ParentSheetName, wc.id WbConfigId, df.name WorkbookName, sht.id SheetId, sht.name SheetName
from workbook_configuration wc
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df on djc.dap_file__id = df.id
inner join sheet sht on wc.id = sht.workbook_fk
inner join sheet sht2 on sht.external_sheet__id = sht2.id
inner join workbook_configuration wc2 on wc2.id = sht2.workbook_fk
inner join dap_job_configuration djc2 on wc2.id = djc2.id
inner join dap_file df2 on djc2.dap_file__id = df2.id
order by df2.name;

#Workbook to Infographic link
select wc.id WbConfigId, df2.name WorkbookName, sht.id SheetId, sht.name SheetName, igm.id IGMId, df.id DapFileId, df.name InfographicName
from infographic_data_set igds
inner join infographic_model_infographic_data_set igmigds on igmigds.data_sets__id = igds.id
inner join infographic_model igm on igmigds.infographic_model__id = igm.id
inner join dap_file df on igm.dap_file__id = df.id
inner join sheet sht on igds.data_set_name = sht.name
inner join workbook_configuration wc on wc.id = sht.workbook_fk
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id
order by igm.id;

#Workbook to ExportJob link
select wc.id WbConfigId, df2.name WorkbookName, sht.id SheetId, sht.name SheetName, df.id DapFileId, df.name ExportJobName
from dap_file df
inner join dap_job_configuration djc on df.id = djc.dap_file__id
inner join data_sink_configuration dsc on djc.id = dsc.id
inner join sheet sht on dsc.sheet_fk = sht.id
inner join workbook_configuration wc on wc.id = sht.workbook_fk
inner join dap_job_configuration djc2 on wc.id = djc2.id
inner join dap_file df2 on djc2.dap_file__id = df2.id
order by df2.name;

#Data Connection to Import Data
select ds.type_id DataSourceType, df.id DataStoreId, df.name DataStoreName, df2.id ImportDataId, df2.name ImportDataName
from data_store ds
inner join dap_file df on ds.dap_file__id = df.id
inner join data_source_configuration dsc on ds.id = dsc.connection_fk
inner join dap_job_configuration djc on dsc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id;

#Data Connection to Import Data with Permission and Groups Information
select ds.type_id DataSourceType, df.id DataStoreId, df.name DataStoreName, df2.id ImportDataId, df2.name ImportDataName, p.other_permission_bits ImportWorldPermissions, p.owner ImportOwner, gp.group_name ImportGroupName, gp.permission_bits ImportGroupPermission
from data_store ds
inner join dap_file df on ds.dap_file__id = df.id
inner join data_source_configuration dsc on ds.id = dsc.connection_fk
inner join dap_job_configuration djc on dsc.id = djc.id
inner join dap_file df2 on djc.dap_file__id = df2.id
inner join permission p on df2.permission_fk = p.id
left outer join group_permission gp on p.id = gp.permission__id
order by df.id, df2.id;
The SQL query below lists the data source type, ID, and name; any dependent import jobs with their ID, name, owner, and table name; and any dependent workbooks with their ID, name, and specific sheet ID and name.
#Data connection to Import Job + ID, Name, Owner + Workbook
SELECT ds.type_id DataSourceType, dsf.id DataStoreId, dsf.name DataStoreName, ijf.id ImportDataId, ijf.name ImportDataName, p.owner ImportOwner, jcp.property_value AS TableName, wc.id WbConfigId, wbf.name WorkbookName, sht.id SheetId, sht.name SheetName
FROM workbook_configuration wc, dap_job_configuration wbc, dap_file wbf, sheet sht, data_source_configuration ijc, dap_job_configuration dsc, dap_file ijf, data_store ds, dap_file dsf, data_source_configuration dssc, dap_job_configuration ijjc, permission p, job_configuration_property jcp
WHERE jcp.property_key = 'tableName'
AND wc.id = wbc.id
AND wbc.dap_file__id = wbf.id
AND wc.id = sht.workbook_fk
AND sht.data_source_id = ijc.id
AND ijc.id = dsc.id
AND dsc.dap_file__id = ijf.id
AND ds.dap_file__id = dsf.id
AND ds.id = dssc.connection_fk
AND dssc.id = ijjc.id
AND ijjc.dap_file__id = ijf.id
AND ijf.permission_fk = p.id
AND ijjc.id = jcp.configuration_fk;
This query contains a stored procedure that follows the parent-child folder relationships in the Datameer database, constructs the location path of each artifact, and outputs it in the result.
#Data Connection to Export Data
SELECT ds.type_id DataSourceType, df.id DataStoreId, df.name DataStoreName, df2.id ExportDataId, df2.name ExportDataName
FROM data_store ds
INNER JOIN dap_file df ON ds.dap_file__id = df.id
INNER JOIN data_sink_configuration dsc ON ds.id = dsc.connection_fk
INNER JOIN dap_job_configuration djc ON dsc.id = djc.id
INNER JOIN dap_file df2 ON djc.dap_file__id = df2.id;
The next query finds all workbooks and export jobs based on the list of workbook job execution IDs specified in the WHERE clause. Replace (1,2,3,4,5,6,7,8,9,10) with your actual workbook job ID list.
#Job Execution ID to Workbooks and Export Jobs
SELECT group_concat(dje.id separator ',') AS ListOfJobIDs, djc.id AS WorkbookID, df1.name AS WorkbookName, djc2.id AS ExportJobID, df2.name AS ExportJobName
FROM file_dependency fd
INNER JOIN dap_file df1 ON df1.id=fd.dependency_file_id AND df1.extension='WORKBOOK_EXTENSION'
INNER JOIN dap_file df2 ON df2.id=fd.source_file_id AND df2.extension='EXPORT_JOB_EXTENSION'
INNER JOIN dap_job_configuration djc ON djc.dap_file__id=df1.id
INNER JOIN dap_job_execution dje ON dje.dap_job_configuration__id=djc.id
INNER JOIN dap_job_configuration djc2 ON djc2.dap_file__id=df2.id
WHERE dje.id IN (1,2,3,4,5,6,7,8,9,10)
GROUP BY ExportJobID;
This statement finds all import jobs and data links linked to the workbooks, based on the workbook job execution IDs. Replace (1,2,3,4,5,6,7,8,9,10) with your actual workbook job ID list.
#Workbook Execution Job ID to Import Job and Data Link
SELECT group_concat(dje.id separator ',') AS ListOfJobIDs, djc.id AS WorkbookID, djc2.id AS ImportID, df1.name AS ImportName, df2.name AS WorkbookName
FROM file_dependency fd
INNER JOIN dap_file df1 ON df1.id=fd.dependency_file_id AND (df1.extension='IMPORT_JOB_EXTENSION' OR df1.extension='IMPORT_LINK_JOB_EXTENSION')
INNER JOIN dap_file df2 ON df2.id=fd.source_file_id AND df2.extension='WORKBOOK_EXTENSION'
INNER JOIN dap_job_configuration djc ON djc.dap_file__id=df2.id
INNER JOIN dap_job_execution dje ON dje.dap_job_configuration__id=djc.id
INNER JOIN dap_job_configuration djc2 ON djc2.dap_file__id=df1.id
WHERE dje.id IN (1,2,3,4,5,6,7,8,9,10)
GROUP BY ImportID;
Find all shared groups for a particular job (replace 8 with your job configuration ID):
#Shared groups for job
SELECT *
FROM group_permission gp
JOIN dap_file df ON gp.permission__id = df.permission_fk
JOIN dap_job_configuration djc ON df.id = djc.dap_file__id
WHERE djc.id = 8;
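If you don't know the job configuration ID up front, the same joins can filter by artifact name instead; this is a sketch, and '<artifact name>' is a placeholder:

#Shared groups for a job, looked up by artifact name
SELECT gp.*
FROM group_permission gp
JOIN dap_file df ON gp.permission__id = df.permission_fk
JOIN dap_job_configuration djc ON df.id = djc.dap_file__id
WHERE df.name = '<artifact name>';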
Job Execution Queries
This query provides the job start/end times and some additional information. To get mapper and reducer counts and the time between tasks, you need to save and sift through the job trace and task logs.
#Current state of executions
select djc.id JobConfigId,
CASE dje.job_status
WHEN 0 THEN 'QUEUED'
WHEN 1 THEN 'RUNNING'
WHEN 2 THEN 'COMPLETED'
WHEN 3 THEN 'ERROR'
WHEN 4 THEN 'COMPLETED_WITH_WARNINGS'
WHEN 5 THEN 'WAITING_FOR_OTHER_JOB'
WHEN 6 THEN 'CANCELED'
WHEN 7 THEN 'ABORTING'
WHEN 8 THEN 'WAITING'
ELSE 'OTHER'
END JobConfigStatus,
dje.id JobExeId, dje.start_time StartTime, dje.stop_time StopTime, df.name Name, dje.user Executor, TIMEDIFF(dje.stop_time, dje.start_time) Duration
from dap_job_configuration djc
inner join dap_file df on djc.dap_file__id = df.id
inner join dap_job_execution dje on djc.id = dje.dap_job_configuration__id;
Note that Datameer only keeps the last 28 days of execution data before it is expunged. This is a configuration value found in <DM_HOME>/conf/default.properties:

# Define the maximum number of days job executions are saved in the job history, after a job has been completed.
housekeeping.execution.max-age=28d
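To check how far back your remaining execution history actually reaches, a minimal sketch against the dap_job_execution table used in the queries above:

#Oldest and newest job executions still kept in the history
select min(start_time) OldestExecution, max(start_time) NewestExecution
from dap_job_execution;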
#All data related to a workbook with and without an execution ID
select djc.id JobConfigId, djc.dap_file__id ConfigFileId, df.name ConfigName, d.id DataId, d.uri Uri, df2.id DataFileId, df2.name DataName, df2.creation_date DataCreationDate, df2.extension DataFileExtension, df2.file_mode DataFileMode, dje.id ExeId, dje.created_data__id ExeDataId
from dap_job_configuration djc
inner join dap_file df on djc.dap_file__id = df.id
inner join data d on djc.id = d.dap_job_configuration__id
inner join dap_file df2 on d.dap_file__id = df2.id
left outer join dap_job_execution dje on djc.id = dje.dap_job_configuration__id and d.id = dje.created_data__id
where djc.id=<id>;
#Workbook to worksheet to currently selected sheet
select wc.id WbConfigId, df.id DapFileId, df.name WbName, sht.id SheetId, sht.name SheetName, sht.position Position, sht.keep Keep, wbv.current_sheet_index CurrentSheetNo
from workbook_configuration wc
inner join dap_job_configuration djc on wc.id = djc.id
inner join dap_file df on djc.dap_file__id = df.id
inner join sheet sht on wc.id = sht.workbook_fk
inner join workbook_viewstate wbv on wc.workbook_view_fk = wbv.id
order by df.name, sht.position;
Job Configuration Queries
#Find configurations with the most data objects
SELECT dap_job_configuration__id, count(*) AS c
FROM data
WHERE data.status = 0
GROUP BY data.dap_job_configuration__id
ORDER BY c DESC LIMIT 25;
#Datastore Config Info
SELECT ds.id DataStoreId, dscp.property_key, dscp.property_value, df.name Name
FROM data_store ds
INNER JOIN data_store_configuration_property dscp ON dscp.configuration_fk = ds.id
INNER JOIN dap_file df ON ds.dap_file__id = df.id
WHERE df.name = 'SystemDatabase';
#Get data-source with max-mapper settings
SELECT dsc.id, dsc.max_mappers, ds.type_id, df.extension
FROM data_source_configuration dsc, data_store ds, dap_job_configuration djc, dap_file df
WHERE dsc.connection_fk = ds.id
AND dsc.id = djc.id
AND df.id = djc.dap_file__id
AND dsc.max_mappers IS NOT NULL;

#Get custom hadoop properties from data-stores
SELECT dscp.id, dscp.property_key, dscp.property_value, ds.type_id
FROM data_store_configuration_property AS dscp
INNER JOIN data_store AS ds ON dscp.configuration_fk = ds.id
WHERE dscp.property_key = 'hadoop.properties';

#Get job-configurations with max-split settings
SELECT * FROM dap_job_configuration WHERE hadoop_properties LIKE "%das.splitting.max-split-count%";
#Get job-configurations of user
SELECT djc.id, df.name, djc.hadoop_properties
FROM dap_job_configuration djc
INNER JOIN dap_file df ON djc.dap_file__id = df.id
INNER JOIN permission p ON df.permission_fk = p.id
WHERE p.owner = 'system';
#Get hadoop properties from cluster setup
SELECT * FROM property WHERE name = "hadoop.customPropertiesString";

#Custom "Table Name" query for Export Job
SELECT name ExportJobName, table_name DbTableName, create_new_table NewTable, record_replacement ReplaceRecords
FROM data_base_data_sink dbds
INNER JOIN dap_job_configuration djc ON dbds.id = djc.id
INNER JOIN dap_file df ON djc.dap_file__id = df.id
WHERE dbds.id = '<configurationID>';
Here, the configuration ID is the job configuration ID of your particular export job.
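If you don't know the configuration ID yet, you can look it up by export job name. A sketch using the joins from the queries above; 'MyExportJob' is a placeholder:

#Look up the job configuration ID of an export job by name
SELECT djc.id ConfigurationId, df.name ExportJobName
FROM dap_job_configuration djc
INNER JOIN dap_file df ON djc.dap_file__id = df.id
WHERE df.extension = 'EXPORT_JOB_EXTENSION'
AND df.name = 'MyExportJob';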
License Volume Queries
#License Consumption By Day
select day, round((sum(volume)/1024)/1024,2) volume_mb
from data_volume_summary
group by day
order by day;

#License Consumption By Month
select SUBSTRING(day,1,7) 'month', round(((sum(volume)/1024)/1024)/1024,2) volume_gb
from data_volume_summary
group by SUBSTRING(day,1,7)
order by day;

#License Consumption By Year
select SUBSTRING(day,1,4) 'year', round((((sum(volume)/1024)/1024)/1024)/1024,4) volume_tb
from data_volume_summary
group by SUBSTRING(day,1,4)
order by day;
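For the overall consumption across the whole recorded period, a minimal sketch over the same data_volume_summary table:

#Total license consumption across all recorded days
select round(((sum(volume)/1024)/1024)/1024,2) volume_gb
from data_volume_summary;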
Artifact Volume Queries
#List paths of Workbooks and Worksheets with disk usage
/* Aggregated by Workbook name */
SELECT d.uri URI,
df.name,
wc.id ConfigID,
sum(wsd.kept) AS Kept,
count(wsd.kept) AS TotalSheets,
(sum(wsd.kept) / count(wsd.kept)) * 100 AS Percentage,
(sum(wsd.data_statistic__bytes) / 1024 / 1024) AS TotalMBytes
FROM workbook_configuration wc,
workbook_sheet_data wsd,
data d, dap_file df,
dap_job_configuration djc
WHERE d.dap_job_configuration__id = wc.id
AND d.id=wsd.workbook_data_fk
AND d.dap_job_configuration__id = djc.id
AND df.id = djc.dap_file__id
GROUP BY wc.id, d.uri, df.name
ORDER BY TotalMBytes desc,
Percentage desc;
/* Aggregated by Worksheet name */
SELECT wc.id ConfigID,
d.uri URI,
wsd.sheet_name WorksheetName,
wsd.data_statistic__bytes Bytes
FROM workbook_configuration wc, workbook_sheet_data wsd, data d, dap_file df, dap_job_configuration djc
WHERE d.dap_job_configuration__id = wc.id
AND d.id=wsd.workbook_data_fk
AND d.dap_job_configuration__id = djc.id
and df.id = djc.dap_file__id
order by Bytes desc;
Custom "Filter By Partitions" query for Workbooks
#Get a list of Workbooks and "Filter By Partitions"
SELECT sh.workbook_fk ID, sh.name Name, sh.partition_selector PartDate
FROM sheet sh
WHERE sh.partition_selector LIKE '%TODAY%';
Number of Input Data Objects for Workbooks
SELECT sheet.workbook_fk, count(*)
FROM data, sheet
WHERE data.dap_job_configuration__id = sheet.data_source_id
AND data.status = 0
AND sheet.data_source_id IS NOT NULL
AND sheet.workbook_fk IN (<Configuration ID>)
GROUP BY sheet.workbook_fk;
Most Frequently Executed Workbooks
SELECT configuration_id AS ConfigID, SUM(jobs_executed) + SUM(mr_jobs_executed) AS Executions FROM daily_workbook_statistics GROUP BY configuration_id ORDER BY Executions DESC LIMIT 10;
Workbooks With the Most Number of Sheets
SELECT workbook_fk as ConfigID, COUNT(id) AS Sheets FROM sheet GROUP BY workbook_fk ORDER BY Sheets DESC LIMIT 10;
Database Size Query
SELECT TABLE_NAME, table_rows, data_length, index_length, round(((data_length + index_length) / 1024 / 1024),2) 'SizeMB'
FROM information_schema.TABLES
WHERE table_schema = 'dap'; /* replace 'dap' with your Datameer schema name if it differs */
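To get one total for the whole schema instead of per-table numbers, a minimal variant of the query above:

#Total size of the Datameer schema in MB
SELECT round(SUM(data_length + index_length) / 1024 / 1024, 2) 'TotalSizeMB'
FROM information_schema.TABLES
WHERE table_schema = 'dap';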
Consumed License Volume Per Import Job (since 2.1.4)
select job_conf_id, sum(volume) from data_volume_summary group by job_conf_id;
Consumed License Volume Per User (since 2.1.4)
select permission.owner, sum(data_volume_summary.volume)
from data_volume_summary, dap_job_configuration, dap_file, permission
where data_volume_summary.job_conf_id = dap_job_configuration.id
and dap_job_configuration.dap_file__id = dap_file.id
and dap_file.permission_fk = permission.id
group by permission.owner;
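A variant of the same query that reports GB and sorts by the heaviest consumers; same joins as above, only the aggregation changes:

#Consumed license volume per user, in GB, heaviest first
select permission.owner, round(((sum(data_volume_summary.volume)/1024)/1024)/1024,2) volume_gb
from data_volume_summary, dap_job_configuration, dap_file, permission
where data_volume_summary.job_conf_id = dap_job_configuration.id
and dap_job_configuration.dap_file__id = dap_file.id
and dap_file.permission_fk = permission.id
group by permission.owner
order by volume_gb desc;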
List all existing artifacts and group membership
#All artifacts and group membership
SELECT df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, gp.group_name
FROM dap_file df
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id
WHERE df.extension LIKE '%EXTENSION%'
AND df.file_mode = 'NORMAL';
Since a data object might exist for each artifact, we should gather those too.
#All artifacts with data objects and group membership
SELECT df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, gp.group_name
FROM dap_file df
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id
WHERE df.extension LIKE '%EXTENSION%'
AND NOT df.extension LIKE '%SYSTEM%';
List ImportJobs and Workbooks configured to keep more than 1 result
SELECT djc.id, df.name, df.extension
FROM dap_file df, dap_job_configuration djc
WHERE df.extension in ('IMPORT_JOB_EXTENSION','WORKBOOK_EXTENSION')
AND df.file_mode = 'NORMAL'
AND djc.dap_file__id = df.id
AND (djc.min_keep_count is null OR djc.min_keep_count > 1);
List ImportJobs with the number of partitioned files
SELECT df1.name ArtifactName, perm.owner, djc.id ArtifactId, COUNT(d.id) Pieces, df1.extension Type, df1.last_executed LastExecuted, df1.bytes_processed, SUM(dp.data_statistic__bytes) PartBytes, SUM(dp.data_statistic__record_count) PartRecCount, SUM(dp.data_statistic__uncompressed_bytes) PartUncompBytes
FROM dap_job_configuration djc
INNER JOIN dap_file df1 ON djc.dap_file__id = df1.id
INNER JOIN data d ON djc.id = d.dap_job_configuration__id
INNER JOIN dap_file df2 ON d.dap_file__id = df2.id
INNER JOIN permission perm ON df1.permission_fk = perm.id
LEFT OUTER JOIN data_partition dp ON d.partition_index__id = dp.partition_index__id
WHERE df1.extension = "IMPORT_JOB_EXTENSION"
GROUP BY df1.name
ORDER BY Pieces DESC;
List owners and permissions for workbooks
SELECT wc.id WbConfigId, df.id DapFileId, df.name Name, df.extension Extension, df.permission_fk DFPermFk, perm.id PermissionId, perm.owner Owner, perm.other_permission_bits OtherPermissionBit, gp.id GroupPermId, gp.group_name GroupName, gp.permission_bits GroupPerms
FROM workbook_configuration wc
INNER JOIN dap_job_configuration djc ON wc.id = djc.id
INNER JOIN dap_file df ON djc.dap_file__id = df.id
INNER JOIN permission perm ON wc.full_data_permission_fk = perm.id
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id;
List all scheduled jobs and their schedules
SELECT conf.`id`, file.`name`, file.`extension`, conf.`pull_type`, conf.`schedule` FROM dap_job_configuration conf JOIN dap_file file ON conf.`dap_file__id` = file.`id` WHERE conf.`pull_type` = 2;
Here the pull_type value has the following meaning:
0: Manually
1: When new data comes in
2: Scheduled
List all Scheduled Jobs with Schedule and Owner
SELECT DISTINCT
job_owners.df_name Name,
CASE job_owners.df_extension
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
job_owners.user_name Owner,
djc.id ConfigID,
job_owners.uuid UUID,
job_owners.file_id FileID,
CASE djc.pull_type
WHEN 1 THEN 'When new data comes in'
WHEN 2 THEN 'Scheduled'
END PullType,
djc.schedule Schedule
FROM dap_job_configuration djc,
(
SELECT df.name df_name, df.extension df_extension, df.uuid, df.id file_id, u.name user_name
FROM dap_file df, permission p, user u
WHERE df.permission_fk = p.id
AND p.owner = u.name
AND u.name <> 'admin'
AND df.file_mode = 'NORMAL'
) job_owners
WHERE djc.dap_file__id = job_owners.file_id
AND djc.pull_type <> 0;
Find un-scheduled jobs
SELECT dap_job_configuration__id, start_time, stop_time, triggered_by, schedule, dap_file.id AS FileID, dap_file.name AS Name, dap_file.extension, dap_file.last_executed AS lastExecuted, dap_file.last_changed_date AS lastChanged
FROM dap_job_configuration, dap_job_execution, dap_file
WHERE dap_job_configuration__id = dap_job_configuration.id
AND schedule IS NOT NULL
AND dap_file.id = dap_file__id;
Find scheduled export jobs
select conf.`id`, file.`name`, file.`extension`, conf.`pull_type`, conf.`schedule`
from dap_file file
join dap_job_configuration conf ON file.`id` = conf.`dap_file__id`
where (conf.pull_type = 1 or conf.pull_type = 2)
and file.extension = 'EXPORT_JOB_EXTENSION';
Find jobs with no connection to others
SELECT name ArtifactName,
CASE extension
WHEN 'DATA_STORE_EXTENSION' THEN 'Connection'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
last_changed_date lastChanged, last_executed lastExecuted, id
FROM dap_file
WHERE extension NOT IN ('SYSTEM_JOB_EXTENSION', 'SYSTEM_JOB_DATA_EXTENSION', 'JPG', 'PNG', 'WORKBOOK_DATA_EXTENSION', 'IMPORT_JOB_DATA_EXTENSION', 'EXPORT_JOB_DATA_EXTENSION')
AND id NOT IN (SELECT dependency_file_id FROM file_dependency)
AND id NOT IN (SELECT source_file_id FROM file_dependency)
AND folder_fk != (SELECT id FROM folder WHERE name = '.system');
Find jobs not executed in the last days (adjust the date window in the WHERE clause to the period you want to check)
SELECT name ArtifactName,
CASE extension
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
last_changed_date lastChanged, last_executed lastExecuted, schedule Schedule
FROM dap_file, dap_job_configuration as job_conf
WHERE job_conf.dap_file__id=dap_file.id
AND schedule IS NOT null
AND last_executed <= ADDDATE(CURRENT_DATE(), 1)
AND last_executed >= SUBDATE(CURRENT_DATE(), 10);
List all referencing workbooks that prevent other workbook data from being deleted
If you see that a workbook data object with ID <xyz> is not getting deleted, this is usually because other ACTIVE workbooks are referencing it. You can find those referencing workbooks with the following query:
SELECT data.id 'Ref By Data ID', data.status 'Status', data.dap_job_configuration__id 'Workbook ID', workbook_sheet_data.id 'Ref By Sheet Data ID', workbook_sheet_data.sheet_name 'Sheet Name'
FROM data, workbook_sheet_data, workbook_sheet_external_data
WHERE data.id = workbook_sheet_data.workbook_data_fk
AND workbook_sheet_data.id = workbook_sheet_data_id
AND referenced_workbook_sheet_data_id IN
(SELECT workbook_sheet_data.id
FROM workbook_sheet_data, data
WHERE workbook_sheet_data.workbook_data_fk = data.id
AND data.id = <xyz>);
You can now do the following:
- Delete the referencing workbook data from the UI (in case it is not needed anymore).
- Set the keep flag of the referencing sheet to false so that this doesn't happen in the future (a read-only check of the current keep flags is sketched below).
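To review which sheets of a referencing workbook currently have the keep flag set, a read-only sketch against the sheet table used elsewhere on this page; <workbook config id> is a placeholder:

#Sheets of a workbook and their keep flags (read-only check)
select sht.id SheetId, sht.name SheetName, sht.keep Keep
from sheet sht
where sht.workbook_fk = <workbook config id>;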
Files shared to the world
#Sharing for "Others" enabled on Import Jobs, Data Links, File Uploads, Workbooks, and Export Jobs
SELECT djc.id, df.name Name, df.extension Type, p.owner Owner, p.other_permission_bits Others
FROM dap_file df, dap_job_configuration djc, permission p
WHERE p.other_permission_bits != '___'
AND df.file_mode = 'NORMAL'
AND df.permission_fk = p.id
AND djc.dap_file__id = df.id;

#Infographics with "Others" sharing enabled
SELECT im.id, df.name Name, p.owner Owner, p.other_permission_bits others_sharing
FROM dap_file df, infographic_model im, permission p
WHERE p.other_permission_bits != '___'
AND df.file_mode = 'NORMAL'
AND df.permission_fk = p.id
AND im.dap_file__id = df.id;

#Workbooks sharing full data results
SELECT wc.id, df.name AS WorkbookName, p.owner AS Owner, p.other_permission_bits AS Permissions
FROM workbook_configuration wc
INNER JOIN dap_job_configuration djc ON wc.id = djc.id
INNER JOIN dap_file df ON djc.dap_file__id = df.id
INNER JOIN permission p ON wc.full_data_permission_fk = p.id
WHERE df.file_mode = 'NORMAL'
AND p.other_permission_bits != '___';

#Public Infographics
SELECT im.id, df.name Name, p.owner Owner
FROM dap_file df, infographic_model im, permission p
WHERE p.shared != 0
AND df.file_mode = 'NORMAL'
AND df.permission_fk = p.id
AND im.dap_file__id = df.id;
Amount of Bytes for each executed artifact
(Don't rely on the dap_job_execution table here, because housekeeping cleans it up every 28 days.)
select df1.name ArtifactName, djc.id ArtifactId, df2.name DataName, d.id DataId, df1.extension Type, df1.last_executed LastExecuted, djc.data_volume_size_by_license_period dataVolLicPer, df1.bytes_processed, sum(dp.data_statistic__bytes) PartBytes, sum(dp.data_statistic__record_count) PartRecCount, sum(dp.data_statistic__uncompressed_bytes) PartUncompBytes
from dap_job_configuration djc
inner join dap_file df1 on djc.dap_file__id = df1.id
inner join data d on djc.id = d.dap_job_configuration__id
inner join dap_file df2 on d.dap_file__id = df2.id
left outer join data_partition dp on d.partition_index__id = dp.partition_index__id
GROUP BY df2.id
order by djc.id;
# group by df2.id will sum up the partitions per DataId
# group by df1.id will sum up per ArtifactId
Workbooks attached to dashboards
Workbook info and last execution details for workbooks attached to widgets on old dashboards. Use this query to figure out whether there are upgrade risks for customers coming from 1.x, as dashboards are no longer supported for viewing in 4.3+.
select djc.*, df.*, u.name as owner, u.email, u.login_date as lastLoginDate
from dap_job_configuration djc, dap_file df, widget w, sheet s, workbook_configuration wc, permission p, user u
where w.sheet_fk = s.id
and s.workbook_fk = wc.id
and djc.id = wc.id
and df.id = djc.dap_file__id
and p.id = df.permission_fk
and u.name = p.owner;
All exports to files and databases and their locations and connections
SELECT exportjob.name as exportjob, f.name as connectionName, dsc.connection_fk, coalesce(dscpHDFS.property_value, dscpHive.property_value) databaseOrPath, coalesce(file_name, table_name) target
FROM data_sink_configuration dsc
left outer join file_data_sink fds on fds.id=dsc.id
left outer join data_base_data_sink dbds on dbds.id=dsc.id
inner join data_store ds on ds.id=dsc.connection_fk
left outer join data_store_configuration_property dscpHDFS on dscpHDFS.configuration_fk = ds.id and dscpHDFS.property_key='rootPathPrefix'
left outer join data_store_configuration_property dscpHive on dscpHive.configuration_fk = ds.id and dscpHive.property_key='databaseFilter'
inner join dap_file f on f.id=ds.dap_file__id
inner join dap_job_configuration job on job.id=dsc.id
inner join dap_file exportJob on exportJob.id=job.dap_file__id;
Space Consuming Jobs
SELECT df.name 'Job',
CASE df.extension
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
d.dap_job_configuration__id ConfigID, p.owner Owner, d.uri URI,
round(df.bytes_processed/1024/1024/1024,2) 'Size in GB',
d.effective_date 'Effective Date', d.id 'Data Id'
FROM data d, dap_file df, permission p, dap_job_configuration djc
WHERE
df.extension in ('IMPORT_JOB_EXTENSION','WORKBOOK_EXTENSION','EXPORT_JOB_EXTENSION','IMPORT_LINK_JOB_EXTENSION')
AND djc.dap_file__id = df.id AND djc.id = d.dap_job_configuration__id
AND p.id = df.permission_fk
order by df.bytes_processed desc;
List the owner of all artifacts together with ConfigID, FileID, and UUID, filtered by user if necessary.
SELECT df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
perm.owner Owner,
djc.id ConfigID,
djc.dap_file__id FileID,
df.uuid UUID
FROM dap_file df,
dap_job_configuration djc,
permission perm
WHERE df.extension IN ('UPLOAD_JOB_EXTENSION',
'IMPORT_LINK_JOB_EXTENSION',
'IMPORT_JOB_EXTENSION',
'WORKBOOK_EXTENSION',
'EXPORT_JOB_EXTENSION')
AND df.file_mode = 'NORMAL'
AND djc.dap_file__id = df.id
AND perm.id = df.permission_fk;
/* AND perm.owner LIKE '%<user>%' */
Get the file name, fileID and UUID for each file in a specific folder
SELECT * FROM
(SELECT concat('/',GetAncestry(df.folder_fk),'/') AS PATH,
df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type,
p.owner AS Owner,
p.id AS PermissionID,
djc.id AS ConfigID,
djc.dap_file__id AS FileID,
df.uuid AS UUID
FROM permission p,
dap_job_configuration djc,
dap_file df
WHERE p.id = df.permission_fk
AND df.id = djc.dap_file__id) A
WHERE lower(A.PATH) LIKE '/%' /* replace '/%' with the lower-cased folder path, e.g. '/myfolder/%' */
ORDER BY A.OWNER, A.PATH;
SQL statement that delivers an 'ls -Ral *.wbk'-style listing as output
SELECT
concat('/',GetAncestry(df.folder_fk),'/') AS Path,
df.name Name,
CASE df.extension
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
END Type,
perm.owner Owner,
gp.group_name "Group",
'RWX' AS Perms,
gp.permission_bits GroupPerms,
perm.other_permission_bits Others,
wc.id ConfigID,
df.id FileID,
df.uuid UUID,
df.bytes_processed Bytes,
df.last_changed_date Changed,
df.last_executed Executed
FROM workbook_configuration wc
INNER JOIN dap_job_configuration djc ON wc.id = djc.id
INNER JOIN dap_file df ON djc.dap_file__id = df.id
/*
As of v6.1 wc.data_permission_fk
*/
INNER JOIN permission perm ON wc.full_data_permission_fk = perm.id
LEFT OUTER JOIN group_permission gp ON df.permission_fk = gp.permission__id;
Top artifacts using data volume in the current term (since <timestamp>)
SELECT t1.job_conf_id AS ConfigID,
t1.GB,
df.name ArtifactName,
CASE df.extension
WHEN 'UPLOAD_JOB_EXTENSION' THEN 'File Upload'
WHEN 'IMPORT_LINK_JOB_EXTENSION' THEN 'Data Link'
WHEN 'IMPORT_JOB_EXTENSION' THEN 'Import Job'
WHEN 'WORKBOOK_EXTENSION' THEN 'Workbook'
WHEN 'EXPORT_JOB_EXTENSION' THEN 'Export Job'
END Type
FROM (
SELECT job_conf_id,
round(sum(volume)/1024/1024/1024,2) AS GB
FROM data_volume_summary /* WHERE day >= "1970-01-21 00:00:00" */
GROUP BY job_conf_id
ORDER BY GB DESC /* LIMIT 20 */
)
t1
JOIN dap_job_configuration djc ON djc.id = t1.job_conf_id
JOIN dap_file df ON df.id = djc.dap_file__id;