The BeeGFS client consists of one kernel module and two system services; this article focuses on the kernel module. The kernel module implements a Linux file system, so it registers a file system type with the kernel. BeeGFS resolves a path by looking up the DEntry of a subdirectory or file inside the DEntry of its parent directory, and then repeating this step level by level. Therefore, when the file system is mounted, the client first has to obtain the ID of the root metadata node from the management node, and then query the information of the root directory DEntry from that root metadata node; this lays the foundation for all subsequent directory resolution.
Register file system type
init_fhgfs_client
- Kernel module initialization:
// fhgfs_client_module\source\program\Main.c #define BEEGFS_LICENSE "GPL v2" static int __init init_fhgfs_client(void) { #define fail_to(target, msg) \ do { \ printk_fhgfs(KERN_WARNING, msg "\n"); \ goto target; \ } while (0) if (!beegfs_fault_inject_init() ) fail_to(fail_fault, "could not register fault-injection debugfs dentry"); if (!beegfs_native_init() ) fail_to(fail_native, "could not allocate emergency pools"); if (!FhgfsOpsCommKit_initEmergencyPools() ) fail_to(fail_commkitpools, "could not allocate emergency pools"); if (!SocketTk_initOnce() ) fail_to(fail_socket, "SocketTk initialization failed"); if (!FhgfsOps_initInodeCache() ) fail_to(fail_inode, "Inode cache initialization failed"); if (!RWPagesWork_initworkQueue() ) fail_to(fail_rwpages, "Page work queue registration failed"); if (!FhgfsOpsRemoting_initMsgBufCache() ) fail_to(fail_msgbuf, "Message cache initialization failed"); if (!FhgfsOpsPages_initPageListVecCache() ) fail_to(fail_pagelists, "PageVec cache initialization failed"); if (FhgfsOps_registerFilesystem() ) fail_to(fail_register, "File system registration failed"); ProcFs_createGeneralDir(); printk_fhgfs(KERN_INFO, "File system registered. Type: %s. 
Version: %s\n", BEEGFS_MODULE_NAME_STR, App_getVersionStr() ); return 0; fail_register: FhgfsOpsPages_destroyPageListVecCache(); fail_pagelists: FhgfsOpsRemoting_destroyMsgBufCache(); fail_msgbuf: RWPagesWork_destroyWorkQueue(); fail_rwpages: FhgfsOps_destroyInodeCache(); fail_inode: SocketTk_uninitOnce(); fail_socket: FhgfsOpsCommKit_releaseEmergencyPools(); fail_commkitpools: beegfs_native_release(); fail_native: beegfs_fault_inject_release(); fail_fault: return -EPERM; } static void __exit exit_fhgfs_client(void) { ProcFs_removeGeneralDir(); BUG_ON(FhgfsOps_unregisterFilesystem() ); FhgfsOpsPages_destroyPageListVecCache(); FhgfsOpsRemoting_destroyMsgBufCache(); RWPagesWork_destroyWorkQueue(); FhgfsOps_destroyInodeCache(); SocketTk_uninitOnce(); FhgfsOpsCommKit_releaseEmergencyPools(); beegfs_native_release(); beegfs_fault_inject_release(); printk_fhgfs(KERN_INFO, "BeeGFS client unloaded.\n"); } module_init(init_fhgfs_client) module_exit(exit_fhgfs_client) MODULE_LICENSE(BEEGFS_LICENSE); MODULE_DESCRIPTION("BeeGFS parallel file system client (http://www.beegfs.com)"); MODULE_AUTHOR("Fraunhofer ITWM, CC-HPC");
FhgfsOps_registerFilesystem
- At initialization, register the BeeGFS file system type with the kernel:
// fhgfs_client_module\source\filesystem\FhgfsOpsSuper.c static struct file_system_type fhgfs_fs_type = { .name = BEEGFS_MODULE_NAME_STR, .owner = THIS_MODULE, .kill_sb = FhgfsOps_killSB, //.fs_flags = FS_BINARY_MOUNTDATA, // not required currently #ifdef KERNEL_HAS_GET_SB_NODEV .get_sb = FhgfsOps_getSB, #else .mount = FhgfsOps_mount, // basically the same thing as get_sb before #endif }; int FhgfsOps_registerFilesystem(void) { return register_filesystem(&fhgfs_fs_type); }
Mount file system
FhgfsOps_mount
- When the file system is mounted, FhgfsOps_fillSuper is called indirectly (it is passed as the fill callback to get_sb_nodev/mount_nodev) to fill the file system superblock.
// fhgfs_client_module\source\filesystem\FhgfsOps_versions.c #ifdef KERNEL_HAS_GET_SB_NODEV int FhgfsOps_getSB(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { return get_sb_nodev(fs_type, flags, data, FhgfsOps_fillSuper, mnt); } #else /* kernel 2.6.39 switched from get_sb() to mount(), which provides similar functionality from our point of view. */ struct dentry* FhgfsOps_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, FhgfsOps_fillSuper); } #endif // LINUX_VERSION_CODE
FhgfsOps_fillSuper
- Initialize the superblock of the file system instance and the inode of the root directory. At this time, the ID is simply initialized to 0, which will be updated to the real id later:
// fhgfs_client_module\source\filesystem\FhgfsOpsSuper.c

/**
 * Fill the file system superblock (vfs object)
 *
 * Constructs the per-mount info (sb->s_fs_info incl. the App object), sets the generic super
 * block fields, then creates the root inode with dummy entry info and attaches the root dentry.
 *
 * @param rawMountOptions raw mount options, handed on to __FhgfsOps_constructFsInfo()
 * @param silent not referenced in this function
 * @return 0 on success; -ECANCELED if constructing the per-mount info failed (the specific
 *    error code of that step is not propagated); the PTR_ERR value or -ENOMEM if the root
 *    inode could not be created; -ENOMEM if the root dentry could not be created.
 */
int FhgfsOps_fillSuper(struct super_block* sb, void* rawMountOptions, int silent)
{
   App* app = NULL;
   Config* cfg = NULL;

   struct inode* rootInode;
   struct dentry* rootDentry;
   struct kstat kstat;
   EntryInfo entryInfo;

   // NOTE(review): passed to __FhgfsOps_newInode() below without being initialized here -
   // confirm that the callee does not read it for the (directory) root inode.
   FhgfsIsizeHints iSizeHints;

   // init per-mount app object

   if(__FhgfsOps_constructFsInfo(sb, rawMountOptions) )
      return -ECANCELED;

   // (the App object only becomes available after the per-mount info was constructed)
   app = FhgfsOps_getApp(sb);
   cfg = App_getConfig(app);

   // set up super block data

   sb->s_maxbytes = MAX_LFS_FILESIZE;
   sb->s_blocksize = PAGE_SIZE;
   sb->s_blocksize_bits = PAGE_SHIFT;
   sb->s_magic = BEEGFS_MAGIC;
   sb->s_op = &fhgfs_super_ops;
   sb->s_time_gran = 1000000000; // granularity of c/m/atime in ns
   sb->s_flags |= MS_NODIRATIME;

   if (Config_getSysXAttrsEnabled(cfg ) )
      sb->s_xattr = fhgfs_xattr_handlers_noacl; // handle only user xattrs

#ifdef KERNEL_HAS_POSIX_GET_ACL
   // (may override the s_xattr assignment from above)
   if (Config_getSysACLsEnabled(cfg) )
   {
      sb->s_xattr = fhgfs_xattr_handlers; // replace with acl-capable xattr handlers
      sb->s_flags |= MS_POSIXACL;
   }
#endif // KERNEL_HAS_POSIX_GET_ACL

   /* MS_ACTIVE is rather important as it marks the super block being successfully initialized and
    * allows the vfs to keep important inodes in the cache. However, it seems it is already
    * initialized in vfs generic mount functions.
      sb->s_flags |= MS_ACTIVE; // used in iput_final() */

   // NFS kernel export is probably not worth the backport efforts for kernels before 2.6.29
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29)
   sb->s_export_op = &fhgfs_export_ops;
#endif

#if defined(KERNEL_HAS_SB_BDI)
   sb->s_bdi = FhgfsOps_getBdi(sb);
#endif

   // init root inode

   memset(&kstat, 0, sizeof(struct kstat) );

   kstat.ino = BEEGFS_INODE_ROOT_INO;
   kstat.mode = S_IFDIR | 0777; // allow access for everyone
   kstat.atime = kstat.mtime = kstat.ctime = current_fs_time(sb);
   kstat.uid = FhgfsCommon_getCurrentKernelUserID();
   kstat.gid = FhgfsCommon_getCurrentKernelGroupID();
   kstat.blksize = Config_getTuneInodeBlockSize(cfg);
   kstat.nlink = 1;

   // root entryInfo is always updated when someone asks for it (so we just set dummy values here)
   EntryInfo_init(&entryInfo, NodeOrGroup_fromGroup(0), StringTk_strDup(""), StringTk_strDup(""),
      StringTk_strDup(""), DirEntryType_DIRECTORY, 0);

   rootInode = __FhgfsOps_newInode(sb, &kstat, 0, &entryInfo, &iSizeHints);
   if(!rootInode || IS_ERR(rootInode) )
   { // no root inode => tear down the per-mount info that was constructed above
      __FhgfsOps_destructFsInfo(sb);
      return IS_ERR(rootInode) ? PTR_ERR(rootInode) : -ENOMEM;
   }

   rootDentry = d_make_root(rootInode);
   if(!rootDentry)
   {
      __FhgfsOps_destructFsInfo(sb);
      return -ENOMEM;
   }

#ifdef KERNEL_HAS_S_D_OP
   // linux 2.6.38 switched from individual per-dentry to default superblock d_ops.

   /* note: Only set default dentry operations here, as we don't want those OPs set for the root
    * dentry. In fact, setting as before would only slow down everything a bit, due to
    * useless revalidation of our root dentry. */
   sb->s_d_op = &fhgfs_dentry_ops;
#endif // KERNEL_HAS_S_D_OP

   rootDentry->d_time = jiffies;

   sb->s_root = rootDentry;

   return 0;
}
Initialize file system
__FhgfsOps_constructFsInfo
- Allocate memory and construct the basic data structures of the file system:
// fhgfs_client_module\source\filesystem\FhgfsOpsSuper.c

/**
 * Initialize sb->s_fs_info
 *
 * Allocates the zeroed FhgfsSuperBlockInfo, initializes the per-mount App object in it and (if
 * the kernel has a per-sb bdi) sets up the backing dev info. On every error path the sbInfo is
 * freed again and sb->s_fs_info is reset to NULL.
 *
 * @param rawMountOptions raw mount options, handed on to __FhgfsOps_initApp()
 * @return 0 on success, negative linux error code otherwise (-ENOMEM if the allocation failed,
 *    -EINVAL for any App init error, or the error code of the bdi setup)
 */
int __FhgfsOps_constructFsInfo(struct super_block* sb, void* rawMountOptions)
{
   int res;
   int appRes;
   App* app;
   Logger* log;

   // NOTE(review): this guard is not the exact negation of the one that selects
   // super_setup_bdi_name() below; if both KERNEL_HAS_SUPER_SETUP_BDI_NAME and
   // KERNEL_HAS_BDI_SETUP_AND_REGISTER were defined, bdi would be used without being
   // declared - confirm that this combination cannot occur.
#if defined(KERNEL_HAS_SB_BDI) && !defined(KERNEL_HAS_SUPER_SETUP_BDI_NAME)
   struct backing_dev_info* bdi;
#endif

   // use kzalloc to also zero the bdi
   FhgfsSuperBlockInfo* sbInfo = kzalloc(sizeof(FhgfsSuperBlockInfo), GFP_KERNEL);
   if (!sbInfo)
   {
      printk_fhgfs_debug(KERN_INFO, "Failed to allocate memory for FhgfsSuperBlockInfo");
      sb->s_fs_info = NULL;
      return -ENOMEM;
   }

   // must be assigned before the App init below, which is only given the sb
   sb->s_fs_info = sbInfo;

   appRes = __FhgfsOps_initApp(sb, rawMountOptions);
   if(appRes)
   { // appRes is an APPCODE_..., not a linux error code => map everything to -EINVAL
      printk_fhgfs_debug(KERN_INFO, "Failed to initialize App object");
      res = -EINVAL;
      goto outFreeSB;
   }

   app = FhgfsOps_getApp(sb);
   log = App_getLogger(app);
   IGNORE_UNUSED_VARIABLE(log); // log is only used in the KERNEL_HAS_SB_BDI error path below

#if defined(KERNEL_HAS_SB_BDI)

   #if defined(KERNEL_HAS_SUPER_SETUP_BDI_NAME) && !defined(KERNEL_HAS_BDI_SETUP_AND_REGISTER)
   {
      // sequence number shared by all mounts, so that each bdi gets a distinct name
      static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);

      res = super_setup_bdi_name(sb, BEEGFS_MODULE_NAME_STR "-%ld",
         atomic_long_inc_return(&bdi_seq));
   }
   #else
   bdi = &sbInfo->bdi;

   /* NOTE: The kernel expects a fully initialized bdi structure, so at a minimum it has to be
    *       allocated by kzalloc() or memset(bdi, 0, sizeof(*bdi)).
    *       we don't set the congest_* callbacks (like every other filesystem) because those are
    *       intended for dm and md.
    */
   bdi->ra_pages = BEEGFS_DEFAULT_READAHEAD_PAGES;

      #if defined(KERNEL_HAS_BDI_CAP_MAP_COPY)
   res = bdi_setup_and_register(bdi, BEEGFS_MODULE_NAME_STR, BDI_CAP_MAP_COPY);
      #else
   res = bdi_setup_and_register(bdi, BEEGFS_MODULE_NAME_STR);
      #endif
   #endif

   if (res)
   { // bdi setup failed => the App was already initialized above and has to be undone as well
      Logger_logFormatted(log, 2, __func__, "Failed to init super-block (bdi) information: %d",
         res);
      __FhgfsOps_uninitApp(app);
      goto outFreeSB;
   }
#endif

   // set root inode attribs to uninit'ed
   FhgfsOps_setHasRootEntryInfo(sb, false);
   FhgfsOps_setIsRootInited(sb, false);

   printk_fhgfs(KERN_INFO, "BeeGFS mount ready.\n");

   return 0; // all ok, res should be 0 here

   // error exit: res has been set to the error code by the failing step
outFreeSB:
   kfree(sbInfo);
   sb->s_fs_info = NULL;

   return res;
}
__FhgfsOps_initApp
- Parse the parameters and continue initialization:
// fhgfs_client_module\source\filesystem\FhgfsOpsSuper.c /** * Creates and initializes the per-mount application object. */ int __FhgfsOps_initApp(struct super_block* sb, char* rawMountOptions) { MountConfig* mountConfig; bool parseRes; App* app; int appRes; // create mountConfig (parse from mount options) mountConfig = MountConfig_construct(); parseRes = MountConfig_parseFromRawOptions(mountConfig, rawMountOptions); if(!parseRes) { MountConfig_destruct(mountConfig); return APPCODE_INVALID_CONFIG; } //printk_fhgfs(KERN_INFO, "Initializing App...\n"); // debug in app = FhgfsOps_getApp(sb); App_init(app, mountConfig); appRes = App_run(app); if(appRes != APPCODE_NO_ERROR) { // error occurred => clean up printk_fhgfs_debug(KERN_INFO, "Stopping App...\n"); App_stop(app); printk_fhgfs_debug(KERN_INFO, "Cleaning up...\n"); App_uninit(app); printk_fhgfs_debug(KERN_INFO, "App unitialized.\n"); return appRes; } ProcFs_createEntries(app); return appRes; }
App_run
- Initialize the basic components of the client:
// fhgfs_client_module\source\app\App.c int App_run(App* this) { // init data objects & storage if(!__App_initDataObjects(this, this->mountConfig) ) { printk_fhgfs(KERN_WARNING, "Configuration error: Initialization of common objects failed. " "(Log file may provide additional information.)\n"); this->appResult = APPCODE_INVALID_CONFIG; return this->appResult; } if(!__App_initInodeOperations(this) ) { printk_fhgfs(KERN_WARNING, "Initialization of inode operations failed."); this->appResult = APPCODE_INITIALIZATION_ERROR; return this->appResult; } if(!__App_initStorage(this) ) { printk_fhgfs(KERN_WARNING, "Configuration error: Initialization of storage failed\n"); this->appResult = APPCODE_INVALID_CONFIG; return this->appResult; } // init components if(!__App_initComponents(this) ) { printk_fhgfs(KERN_WARNING, "Component initialization error. " "(Log file may provide additional information.)\n"); this->appResult = APPCODE_INITIALIZATION_ERROR; return this->appResult; } __App_logInfos(this); // start components __App_startComponents(this); // Note: We wait some ms for the node downloads here because the kernel would like to // check the properties of the root directory directly after mount. InternodeSyncer_waitForMgmtInit(this->internodeSyncer, 1000); if(!__App_mountServerCheck(this) ) { // mount check failed => cancel mount printk_fhgfs(KERN_WARNING, "Mount sanity check failed. Canceling mount. " "(Log file may provide additional information. Check can be disabled with " "sysMountSanityCheckMS=0 in the config file.)\n"); this->appResult = APPCODE_INITIALIZATION_ERROR; return this->appResult; } // mark: mount succeeded if we got here! return this->appResult; }
__App_initInodeOperations
- Initialize the inode operation tables that are used later whenever a new inode is created:
// fhgfs_client_module\source\app\App.c /** * Initialized the inode_operations structs depending on what features have been enabled in * the config. */ bool __App_initInodeOperations(App* this) { Config* cfg = App_getConfig(this); this->fileInodeOps = os_kzalloc(sizeof(struct inode_operations) ); this->symlinkInodeOps = os_kzalloc(sizeof(struct inode_operations) ); this->dirInodeOps = os_kzalloc(sizeof(struct inode_operations) ); this->specialInodeOps = os_kzalloc(sizeof(struct inode_operations) ); if (!this->fileInodeOps || !this->symlinkInodeOps || !this->dirInodeOps || !this->specialInodeOps) { SAFE_KFREE(this->fileInodeOps); SAFE_KFREE(this->symlinkInodeOps); SAFE_KFREE(this->dirInodeOps); SAFE_KFREE(this->specialInodeOps); return false; } this->fileInodeOps->getattr = FhgfsOps_getattr; this->fileInodeOps->permission = FhgfsOps_permission; this->fileInodeOps->setattr = FhgfsOps_setattr; #ifdef KERNEL_HAS_GENERIC_READLINK this->symlinkInodeOps->readlink = generic_readlink; // default is fine for us currently #endif #ifdef KERNEL_HAS_GET_LINK this->symlinkInodeOps->get_link = FhgfsOps_get_link; #else this->symlinkInodeOps->follow_link = FhgfsOps_follow_link; this->symlinkInodeOps->put_link = FhgfsOps_put_link; #endif this->symlinkInodeOps->getattr = FhgfsOps_getattr; this->symlinkInodeOps->permission = FhgfsOps_permission; this->symlinkInodeOps->setattr = FhgfsOps_setattr; #ifdef KERNEL_HAS_ATOMIC_OPEN #ifdef BEEGFS_ENABLE_ATOMIC_OPEN this->dirInodeOps->atomic_open = FhgfsOps_atomicOpen; #endif // BEEGFS_ENABLE_ATOMIC_OPEN #endif this->dirInodeOps->lookup = FhgfsOps_lookupIntent; this->dirInodeOps->create = FhgfsOps_createIntent; this->dirInodeOps->link = FhgfsOps_link; this->dirInodeOps->unlink = FhgfsOps_unlink; this->dirInodeOps->mknod = FhgfsOps_mknod; this->dirInodeOps->symlink = FhgfsOps_symlink; this->dirInodeOps->mkdir = FhgfsOps_mkdir; this->dirInodeOps->rmdir = FhgfsOps_rmdir; this->dirInodeOps->rename = FhgfsOps_rename; this->dirInodeOps->getattr = 
FhgfsOps_getattr; this->dirInodeOps->permission = FhgfsOps_permission; this->dirInodeOps->setattr = FhgfsOps_setattr; this->specialInodeOps->setattr = FhgfsOps_setattr; if (Config_getSysXAttrsEnabled(cfg) ) { this->fileInodeOps->listxattr = FhgfsOps_listxattr; this->dirInodeOps->listxattr = FhgfsOps_listxattr; #ifdef KERNEL_HAS_GENERIC_GETXATTR this->fileInodeOps->getxattr = generic_getxattr; this->fileInodeOps->removexattr = FhgfsOps_removexattr; this->fileInodeOps->setxattr = generic_setxattr; this->dirInodeOps->getxattr = generic_getxattr; this->dirInodeOps->removexattr = FhgfsOps_removexattr; this->dirInodeOps->setxattr = generic_setxattr; #endif if (Config_getSysACLsEnabled(cfg) ) { #ifdef KERNEL_HAS_POSIX_GET_ACL this->fileInodeOps->get_acl = FhgfsOps_get_acl; this->dirInodeOps->get_acl = FhgfsOps_get_acl; // Note: symlinks don't have ACLs #ifdef KERNEL_HAS_SET_ACL this->fileInodeOps->set_acl = FhgfsOps_set_acl; this->dirInodeOps->set_acl = FhgfsOps_set_acl; #endif // LINUX_VERSION_CODE #else Logger_logErr(this->logger, "Init inode operations", "ACLs activated in config, but not supported on this kernel version."); return false; #endif // KERNEL_HAS_POSIX_GET_ACL } } return true; }
Create and initialize Inode
The call is initiated by the FhgfsOps_fillSuper function.
__FhgfsOps_newInode
- This function is called whenever a new inode is created. The entry information of the parent directory (including the ID of the metadata node on which the parent directory is located and the directory ID) determines which metadata node is accessed for operations on a subdirectory or file:
// fhgfs_client_module\source\filesystem\FhgfsOpsInode.h

/**
 * See __FhgfsOps_newInodeWithParentID for details. This is just a wrapper function.
 *
 * (Calls __FhgfsOps_newInodeWithParentID with a parentNodeID of 0.)
 */
struct inode* __FhgfsOps_newInode(struct super_block* sb, struct kstat* kstat, dev_t dev,
   EntryInfo* entryInfo, FhgfsIsizeHints* iSizeHints)
{
   return __FhgfsOps_newInodeWithParentID(sb, kstat, dev, entryInfo, (NumNodeID){0}, iSizeHints);
}

/**
 * Creates a new inode, inits it from the kstat, inits the ops (depending on the mode)
 * and hashes it.
 *
 * If the inode cache already contains a matching inode (same kstat->ino and entryID), no new
 * inode is created; instead the entryInfo and stat data of the cached inode are updated and
 * the cached inode is returned.
 *
 * Note: Make sure everything is set in the kstat _before_ you call this, because we hash
 * the inode in here (so it can be found and accessed by others when this method returns).
 * Note: Consider using the _instantiateInode()-wrapper instead of calling this directly for new
 * files/dirs.
 *
 * @param kstat must have a valid .ino (inode number)
 * @param dev set to 0 if not required (only used for special files)
 * @param entryInfo contained strings will just be moved to the new inode or free'd in case of an
 *    error (or cached inode), so don't access the given entryInfo anymore after calling this.
 * @param parentNodeID: usually 0, except for NFS export callers, which needs it to connect dentries
 *    with their parents. By default dentries are connected to their parents, so usually this
 *    is not required (nfs is an exception).
 * @param iSizeHints handed on to __FhgfsOps_applyStatDataToInodeUnlocked()
 * @return the new or cached inode; if not successful, the result of iget5_locked() is passed
 *    through unchanged (it is checked for both NULL and IS_ERR here, so callers should check
 *    for both as well).
 */
struct inode* __FhgfsOps_newInodeWithParentID(struct super_block* sb, struct kstat* kstat,
   dev_t dev, EntryInfo* entryInfo, NumNodeID parentNodeID, FhgfsIsizeHints* iSizeHints)
{
   App* app = FhgfsOps_getApp(sb);
   Config* cfg = App_getConfig(app);

   FhgfsInode* fhgfsInode;

   // handed to __FhgfsOps_compareInodeID by iget5_locked() to identify a matching cached inode
   FhgfsInodeComparisonInfo comparisonInfo =
   {
      .inodeHash = kstat->ino, // pre-set by caller
      .entryID = entryInfo->entryID,
   };

   // check inode cache for an existing inode with this ID (and get it) or allocate a new one
   struct inode* inode = iget5_locked(sb, kstat->ino,
      __FhgfsOps_compareInodeID, __FhgfsOps_initNewInodeDummy, &comparisonInfo);

   if(unlikely(!inode || IS_ERR(inode) ) )
      goto cleanup_entryInfo; // allocation of new inode failed

   fhgfsInode = BEEGFS_INODE(inode);

   if( !(inode->i_state & I_NEW) )
   { // Found an existing inode, which is possibly actively used. We still need to update it.

      FhgfsInode_entryInfoWriteLock(fhgfsInode); // LOCK EntryInfo
      FhgfsInode_updateEntryInfoUnlocked(fhgfsInode, entryInfo);
      FhgfsInode_entryInfoWriteUnlock(fhgfsInode); // UNLOCK EntryInfo

      spin_lock(&inode->i_lock);
      __FhgfsOps_applyStatDataToInodeUnlocked(kstat, iSizeHints, inode); // already locked
      Time_setToNow(&fhgfsInode->dataCacheTime);
      spin_unlock(&inode->i_lock);

      goto outNoCleanUp; // we found a matching existing inode => no init needed
   }

   fhgfsInode->parentNodeID = parentNodeID;

   /* note: new inodes are protected by the I_NEW flag from access by other threads until we
    *       call unlock_new_inode(). */

   // init this fresh new inode...

   // no one can access inode yet => unlocked
   __FhgfsOps_applyStatDataToInodeUnlocked(kstat, iSizeHints, inode);

   inode->i_ino = kstat->ino; // pre-set by caller

   inode->i_flags |= S_NOATIME | S_NOCMTIME; // timestamps updated by server

   mapping_set_gfp_mask(&inode->i_data, GFP_USER); // avoid highmem for page cache pages

   // move values (no actual string copy)
   fhgfsInode->entryInfo = *entryInfo;

   // select the inode/file/address space operations that match the file type
   switch (kstat->mode & S_IFMT)
   {
      case S_IFREG: // regular file
      {
         // file and address space ops depend on the configured file cache type
         if(Config_getTuneFileCacheTypeNum(cfg) == FILECACHETYPE_Native)
         {
            inode->i_fop = &fhgfs_file_native_ops;
            inode->i_data.a_ops = &fhgfs_addrspace_native_ops;
         }
         else
         if(Config_getTuneFileCacheTypeNum(cfg) == FILECACHETYPE_Paged)
         { // with pagecache
            inode->i_fop = &fhgfs_file_pagecache_ops;
            inode->i_data.a_ops = &fhgfs_address_pagecache_ops;
         }
         else
         { // no pagecache (=> either none or buffered cache)
            inode->i_fop = &fhgfs_file_buffered_ops;
            inode->i_data.a_ops = &fhgfs_address_ops;
         }

         #ifdef KERNEL_HAS_ADDRESS_SPACE_BDI
            inode->i_data.backing_dev_info = FhgfsOps_getBdi(sb);
         #endif

         inode->i_op = App_getFileInodeOps(app);
      } break;

      case S_IFDIR: // directory
      {
         inode->i_op = App_getDirInodeOps(app);
         inode->i_fop = &fhgfs_dir_ops;
      } break;

      case S_IFLNK: // symlink
      {
         inode->i_op = App_getSymlinkInodeOps(app);
      } break;

      default: // pipes and other special files
      {
         inode->i_op = App_getSpecialInodeOps(app);
         init_special_inode(inode, kstat->mode, dev);
      } break;
   }

   unlock_new_inode(inode); // remove I_NEW flag, so the inode can be accessed by others

   return inode;

   // error occurred (the entryInfo was not taken over by any inode, so it is released here)
cleanup_entryInfo:
   EntryInfo_uninit(entryInfo);

   // found an existing inode
outNoCleanUp:
   return inode;
}
App_getFileInodeOps
- Finally, depending on the file type, the matching getter returns the pointer to the previously initialized inode operations:
// fhgfs_client_module\source\app\App.h struct inode_operations* App_getFileInodeOps(App* this) { return this->fileInodeOps; } struct inode_operations* App_getSymlinkInodeOps(App* this) { return this->symlinkInodeOps; } struct inode_operations* App_getDirInodeOps(App* this) { return this->dirInodeOps; } struct inode_operations* App_getSpecialInodeOps(App* this) { return this->specialInodeOps; }