Xen project Mailing List

[Minios-devel] [UNIKRAFT PATCH 04/22] lib/vfscore: Initial import of OSv vfs

From: Yuri Volchkov <yuri.volchkov@xxxxxxxxx>

Date: Thu, 31 Jan 2019 16:04:59 +0100

Delivery-date: Thu, 31 Jan 2019 15:05:38 +0000

List-id: Mini-os development list <minios-devel.lists.xenproject.org>

The code is imported as is. Commit f1f42915a33bebe120e70af1f32c1a4d92bac780 Signed-off-by: Yuri Volchkov <yuri.volchkov@xxxxxxxxx> --- lib/vfscore/dentry.c | 234 +++ lib/vfscore/fops.c | 189 ++ lib/vfscore/include/vfscore/dentry.h | 45 + lib/vfscore/include/vfscore/mount.h | 171 ++ lib/vfscore/include/vfscore/prex.h | 34 + lib/vfscore/include/vfscore/uio.h | 89 + lib/vfscore/include/vfscore/vnode.h | 246 +++ lib/vfscore/lookup.c | 375 ++++ lib/vfscore/main.c | 2413 ++++++++++++++++++++++++++ lib/vfscore/mount.c | 491 ++++++ lib/vfscore/subr_uio.c | 73 + lib/vfscore/syscalls.c | 1486 ++++++++++++++++ lib/vfscore/task.c | 167 ++ lib/vfscore/vfs.h | 189 ++ lib/vfscore/vnode.c | 522 ++++++ 15 files changed, 6724 insertions(+) create mode 100644 lib/vfscore/dentry.c create mode 100644 lib/vfscore/fops.c create mode 100644 lib/vfscore/include/vfscore/dentry.h create mode 100644 lib/vfscore/include/vfscore/mount.h create mode 100644 lib/vfscore/include/vfscore/prex.h create mode 100644 lib/vfscore/include/vfscore/uio.h create mode 100644 lib/vfscore/include/vfscore/vnode.h create mode 100644 lib/vfscore/lookup.c create mode 100644 lib/vfscore/main.c create mode 100644 lib/vfscore/mount.c create mode 100644 lib/vfscore/subr_uio.c create mode 100644 lib/vfscore/syscalls.c create mode 100644 lib/vfscore/task.c create mode 100644 lib/vfscore/vfs.h create mode 100644 lib/vfscore/vnode.c diff --git a/lib/vfscore/dentry.c b/lib/vfscore/dentry.c new file mode 100644 index 00000000..facd9eaa --- /dev/null +++ b/lib/vfscore/dentry.c @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <string.h> +#include <stdlib.h> +#include <sys/param.h> + +#include <osv/dentry.h> +#include <osv/vnode.h> +#include "vfs.h" + +#define DENTRY_BUCKETS 32 + +static LIST_HEAD(dentry_hash_head, dentry) dentry_hash_table[DENTRY_BUCKETS]; +static LIST_HEAD(fake, dentry) fake; +static mutex dentry_hash_lock; + +/* + * Get the hash value from the mount point and path name. + * XXX: replace with a better hash for 64-bit pointers. 
+ */ +static u_int +dentry_hash(struct mount *mp, const char *path) +{ + u_int val = 0; + + if (path) { + while (*path) { + val = ((val << 5) + val) + *path++; + } + } + return (val ^ (unsigned long) mp) & (DENTRY_BUCKETS - 1); +} + + +struct dentry * +dentry_alloc(struct dentry *parent_dp, struct vnode *vp, const char *path) +{ + struct mount *mp = vp->v_mount; + struct dentry *dp = (dentry*)calloc(sizeof(*dp), 1); + + if (!dp) { + return nullptr; + } + + vref(vp); + + dp->d_refcnt = 1; + dp->d_vnode = vp; + dp->d_mount = mp; + dp->d_path = strdup(path); + LIST_INIT(&dp->d_children); + + if (parent_dp) { + dref(parent_dp); + WITH_LOCK(parent_dp->d_lock) { + // Insert dp into its parent's children list. + LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link); + } + } + dp->d_parent = parent_dp; + + vn_add_name(vp, dp); + + mutex_lock(&dentry_hash_lock); + LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(mp, path)], dp, d_link); + mutex_unlock(&dentry_hash_lock); + return dp; +}; + +struct dentry * +dentry_lookup(struct mount *mp, char *path) +{ + struct dentry *dp; + + mutex_lock(&dentry_hash_lock); + LIST_FOREACH(dp, &dentry_hash_table[dentry_hash(mp, path)], d_link) { + if (dp->d_mount == mp && !strncmp(dp->d_path, path, PATH_MAX)) { + dp->d_refcnt++; + mutex_unlock(&dentry_hash_lock); + return dp; + } + } + mutex_unlock(&dentry_hash_lock); + return nullptr; /* not found */ +} + +static void dentry_children_remove(struct dentry *dp) +{ + struct dentry *entry = nullptr; + + WITH_LOCK(dp->d_lock) { + LIST_FOREACH(entry, &dp->d_children, d_children_link) { + ASSERT(entry); + ASSERT(entry->d_refcnt > 0); + LIST_REMOVE(entry, d_link); + } + } +} + +void +dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path) +{ + struct dentry *old_pdp = dp->d_parent; + char *old_path = dp->d_path; + + if (old_pdp) { + WITH_LOCK(old_pdp->d_lock) { + // Remove dp from its old parent's children list. 
+ LIST_REMOVE(dp, d_children_link); + } + } + + if (parent_dp) { + dref(parent_dp); + WITH_LOCK(parent_dp->d_lock) { + // Insert dp into its new parent's children list. + LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link); + } + } + + WITH_LOCK(dentry_hash_lock) { + // Remove all dp's child dentries from the hashtable. + dentry_children_remove(dp); + // Remove dp with outdated hash info from the hashtable. + LIST_REMOVE(dp, d_link); + // Update dp. + dp->d_path = strdup(path); + dp->d_parent = parent_dp; + // Insert dp updated hash info into the hashtable. + LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(dp->d_mount, path)], + dp, d_link); + } + + if (old_pdp) { + drele(old_pdp); + } + + free(old_path); +} + +void +dentry_remove(struct dentry *dp) +{ + mutex_lock(&dentry_hash_lock); + LIST_REMOVE(dp, d_link); + /* put it on a fake list for drele() to work*/ + LIST_INSERT_HEAD(&fake, dp, d_link); + mutex_unlock(&dentry_hash_lock); +} + +void +dref(struct dentry *dp) +{ + ASSERT(dp); + ASSERT(dp->d_refcnt > 0); + + mutex_lock(&dentry_hash_lock); + dp->d_refcnt++; + mutex_unlock(&dentry_hash_lock); +} + +void +drele(struct dentry *dp) +{ + ASSERT(dp); + ASSERT(dp->d_refcnt > 0); + + mutex_lock(&dentry_hash_lock); + if (--dp->d_refcnt) { + mutex_unlock(&dentry_hash_lock); + return; + } + LIST_REMOVE(dp, d_link); + vn_del_name(dp->d_vnode, dp); + + mutex_unlock(&dentry_hash_lock); + + if (dp->d_parent) { + WITH_LOCK(dp->d_parent->d_lock) { + // Remove dp from its parent's children list. + LIST_REMOVE(dp, d_children_link); + } + drele(dp->d_parent); + } + + vrele(dp->d_vnode); + + free(dp->d_path); + free(dp); +} + +void +dentry_init(void) +{ + int i; + + for (i = 0; i < DENTRY_BUCKETS; i++) { + LIST_INIT(&dentry_hash_table[i]); + } +} diff --git a/lib/vfscore/fops.c b/lib/vfscore/fops.c new file mode 100644 index 00000000..3a8f98b4 --- /dev/null +++ b/lib/vfscore/fops.c @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2013 Cloudius Systems, Ltd. 
+ * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + + +#include <fcntl.h> +#include <sys/stat.h> +#include <osv/file.h> +#include <osv/poll.h> +#include <fs/vfs/vfs.h> +#include <osv/vfs_file.hh> +#include <osv/mmu.hh> +#include <osv/pagecache.hh> + +vfs_file::vfs_file(unsigned flags) + : file(flags, DTYPE_VNODE) +{ +} + +int vfs_file::close() +{ + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + int error; + + vn_lock(vp); + error = VOP_CLOSE(vp, fp); + vn_unlock(vp); + + if (error) + return error; + + fp->f_dentry.reset(); + return 0; +} + +int vfs_file::read(struct uio *uio, int flags) +{ + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + int error; + size_t count; + ssize_t bytes; + + bytes = uio->uio_resid; + + vn_lock(vp); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + error = VOP_READ(vp, fp, uio, 0); + if (!error) { + count = bytes - uio->uio_resid; + if ((flags & FOF_OFFSET) == 0) + fp->f_offset += count; + } + vn_unlock(vp); + + return error; +} + + +int vfs_file::write(struct uio *uio, int flags) +{ + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + int ioflags = 0; + int error; + size_t count; + ssize_t bytes; + + bytes = uio->uio_resid; + + vn_lock(vp); + + if (fp->f_flags & O_APPEND) + ioflags |= IO_APPEND; + if (fp->f_flags & (O_DSYNC|O_SYNC)) + ioflags |= IO_SYNC; + + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; + + error = VOP_WRITE(vp, uio, ioflags); + if (!error) { + count = bytes - uio->uio_resid; + if ((flags & FOF_OFFSET) == 0) + fp->f_offset += count; + } + + vn_unlock(vp); + return error; +} + +int vfs_file::ioctl(u_long com, void *data) +{ + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + int error; + + vn_lock(vp); + error = VOP_IOCTL(vp, fp, com, data); + vn_unlock(vp); + + return error; +} + +int vfs_file::stat(struct stat *st) +{ + auto 
fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + int error; + + vn_lock(vp); + error = vn_stat(vp, st); + vn_unlock(vp); + + return error; +} + +int vfs_file::poll(int events) +{ + return poll_no_poll(events); +} + +int vfs_file::truncate(off_t len) +{ + // somehow this is handled outside file ops + abort(); +} + +int vfs_file::chmod(mode_t mode) +{ + // somehow this is handled outside file ops + abort(); +} + +bool vfs_file::map_page(uintptr_t off, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared) +{ + return pagecache::get(this, off, ptep, pte, write, shared); +} + +bool vfs_file::put_page(void *addr, uintptr_t off, mmu::hw_ptep<0> ptep) +{ + return pagecache::release(this, addr, off, ptep); +} + +void vfs_file::sync(off_t start, off_t end) +{ + pagecache::sync(this, start, end); +} + +// Locking: VOP_CACHE will call into the filesystem, and that can trigger an +// eviction that will hold the mmu-side lock that protects the mappings +// Always follow that order. We however can't just get rid of the mmu-side lock, +// because not all invalidations will be synchronous. +int vfs_file::get_arcbuf(void* key, off_t offset) +{ + struct vnode *vp = f_dentry->d_vnode; + + iovec io[1]; + + io[0].iov_base = key; + uio data; + data.uio_iov = io; + data.uio_iovcnt = 1; + data.uio_offset = offset; + data.uio_resid = mmu::page_size; + data.uio_rw = UIO_READ; + + vn_lock(vp); + assert(VOP_CACHE(vp, this, &data) == 0); + vn_unlock(vp); + + return (data.uio_resid != 0) ? 
-1 : 0; +} + +std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags, unsigned perm, off_t offset) +{ + auto fp = this; + struct vnode *vp = fp->f_dentry->d_vnode; + if (!vp->v_op->vop_cache || (vp->v_size < (off_t)mmu::page_size)) { + return mmu::default_file_mmap(this, range, flags, perm, offset); + } + return mmu::map_file_mmap(this, range, flags, perm, offset); +} diff --git a/lib/vfscore/include/vfscore/dentry.h b/lib/vfscore/include/vfscore/dentry.h new file mode 100644 index 00000000..a2545af8 --- /dev/null +++ b/lib/vfscore/include/vfscore/dentry.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#ifndef _OSV_DENTRY_H +#define _OSV_DENTRY_H 1 + +#include <osv/mutex.h> +#include <bsd/sys/sys/queue.h> + +struct vnode; + +struct dentry { + LIST_ENTRY(dentry) d_link; /* link for hash list */ + int d_refcnt; /* reference count */ + char *d_path; /* pointer to path in fs */ + struct vnode *d_vnode; + struct mount *d_mount; + struct dentry *d_parent; /* pointer to parent */ + LIST_ENTRY(dentry) d_names_link; /* link fo vnode::d_names */ + mutex_t d_lock; + LIST_HEAD(, dentry) d_children; + LIST_ENTRY(dentry) d_children_link; +}; + +#ifdef __cplusplus + +#include <boost/intrusive_ptr.hpp> + +using dentry_ref = boost::intrusive_ptr<dentry>; + +extern "C" { + void dref(struct dentry* dp); + void drele(struct dentry* dp); +}; + +inline void intrusive_ptr_add_ref(dentry* dp) { dref(dp); } +inline void intrusive_ptr_release(dentry* dp) { drele(dp); } + +#endif + +#endif /* _OSV_DENTRY_H */ diff --git a/lib/vfscore/include/vfscore/mount.h b/lib/vfscore/include/vfscore/mount.h new file mode 100644 index 00000000..7268d8ce --- /dev/null +++ b/lib/vfscore/include/vfscore/mount.h @@ -0,0 +1,171 @@ +/*- + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of 
California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)mount.h 8.21 (Berkeley) 5/20/95 + */ + +#ifndef _SYS_MOUNT_H_ +#define _SYS_MOUNT_H_ + +#include <sys/cdefs.h> +#include <sys/statfs.h> +#include <osv/vnode.h> +#include <bsd/sys/sys/queue.h> + +__BEGIN_DECLS + +#ifdef _KERNEL + +/* + * Mount data + */ +struct mount { + struct vfsops *m_op; /* pointer to vfs operation */ + int m_flags; /* mount flag */ + int m_count; /* reference count */ + char m_path[PATH_MAX]; /* mounted path */ + char m_special[PATH_MAX]; /* resource */ + struct device *m_dev; /* mounted device */ + struct dentry *m_root; /* root vnode */ + struct dentry *m_covered; /* vnode covered on parent fs */ + void *m_data; /* private data for fs */ + fsid_t m_fsid; /* id that uniquely identifies the fs */ +}; + +#endif + +/* + * Mount flags. + */ +#define MNT_RDONLY 0x00000001 /* read only filesystem */ +#define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */ +#define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */ +#define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */ +#define MNT_NODEV 0x00000010 /* don't interpret special files */ +#define MNT_UNION 0x00000020 /* union with underlying filesystem */ +#define MNT_ASYNC 0x00000040 /* file system written asynchronously */ + +/* + * Unmount flags. + */ +#define MNT_FORCE 0x00000001 /* forced unmount */ + +/* + * exported mount flags. + */ +#define MNT_EXRDONLY 0x00000080 /* exported read only */ +#define MNT_EXPORTED 0x00000100 /* file system is exported */ +#define MNT_DEFEXPORTED 0x00000200 /* exported to the world */ +#define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */ +#define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */ + +/* + * Flags set by internal operations. 
+ */ +#define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ +#define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ +#define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ + +/* + * Mask of flags that are visible to statfs() + */ +#define MNT_VISFLAGMASK 0x0000ffff + +#ifdef _KERNEL + +/* + * Filesystem type switch table. + */ +struct vfssw { + const char *vs_name; /* name of file system */ + int (*vs_init)(void); /* initialize routine */ + struct vfsops *vs_op; /* pointer to vfs operation */ +}; + +/* + * Operations supported on virtual file system. + */ +struct vfsops { + int (*vfs_mount) (struct mount *, const char *, int, const void *); + int (*vfs_unmount) (struct mount *, int flags); + int (*vfs_sync) (struct mount *); + int (*vfs_vget) (struct mount *, struct vnode *); + int (*vfs_statfs) (struct mount *, struct statfs *); + struct vnops *vfs_vnops; +}; + +typedef int (*vfsop_mount_t)(struct mount *, const char *, int, const void *); +typedef int (*vfsop_umount_t)(struct mount *, int flags); +typedef int (*vfsop_sync_t)(struct mount *); +typedef int (*vfsop_vget_t)(struct mount *, struct vnode *); +typedef int (*vfsop_statfs_t)(struct mount *, struct statfs *); + +/* + * VFS interface + */ +#define VFS_MOUNT(MP, DEV, FL, DAT) ((MP)->m_op->vfs_mount)(MP, DEV, FL, DAT) +#define VFS_UNMOUNT(MP, FL) ((MP)->m_op->vfs_unmount)(MP, FL) +#define VFS_SYNC(MP) ((MP)->m_op->vfs_sync)(MP) +#define VFS_VGET(MP, VP) ((MP)->m_op->vfs_vget)(MP, VP) +#define VFS_STATFS(MP, SFP) ((MP)->m_op->vfs_statfs)(MP, SFP) + +#define VFS_NULL ((void *)vfs_null) + +int vfs_nullop(void); +int vfs_einval(void); + +void vfs_busy(struct mount *mp); +void vfs_unbusy(struct mount *mp); + +void release_mp_dentries(struct mount *mp); + +#endif + +__END_DECLS + +#ifdef __cplusplus + +#include <vector> +#include <string> + +namespace osv { + +struct mount_desc { + std::string special; + std::string path; + std::string type; + std::string options; +}; + 
+std::vector<mount_desc> current_mounts(); + +} + +#endif + +#endif /* !_SYS_MOUNT_H_ */ diff --git a/lib/vfscore/include/vfscore/prex.h b/lib/vfscore/include/vfscore/prex.h new file mode 100644 index 00000000..43650340 --- /dev/null +++ b/lib/vfscore/include/vfscore/prex.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2013 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#ifndef _OSV_PREX_H +#define _OSV_PREX_H 1 + + +#include <unistd.h> +#include <osv/fcntl.h> + +__BEGIN_DECLS + +#define __packed __attribute__((__packed__)) + +#define BSIZE 512 /* size of secondary block (bytes) */ + +#define DO_RDWR 0x2 + +#define PAGE_SIZE 4096 +#define PAGE_MASK (PAGE_SIZE-1) +#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK) + +size_t strlcat(char *dst, const char *src, size_t siz); +size_t strlcpy(char *dst, const char *src, size_t siz); + +void sys_panic(const char *); + +__END_DECLS + +#endif /* _OSV_PREX_H */ diff --git a/lib/vfscore/include/vfscore/uio.h b/lib/vfscore/include/vfscore/uio.h new file mode 100644 index 00000000..696b01cf --- /dev/null +++ b/lib/vfscore/include/vfscore/uio.h @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uio.h 8.5 (Berkeley) 2/22/94 + * $FreeBSD$ + */ + +#ifndef _UIO_H_ +#define _UIO_H_ + +#include <sys/cdefs.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <limits.h> + +__BEGIN_DECLS + +enum uio_rw { UIO_READ, UIO_WRITE }; + +/* + * Safe default to prevent possible overflows in user code, otherwise could + * be SSIZE_T_MAX. 
+ */ +#define IOSIZE_MAX INT_MAX + +#define UIO_MAXIOV 1024 + +#define UIO_SYSSPACE 0 + +struct uio { + struct iovec *uio_iov; /* scatter/gather list */ + int uio_iovcnt; /* length of scatter/gather list */ + off_t uio_offset; /* offset in target object */ + ssize_t uio_resid; /* remaining bytes to process */ + enum uio_rw uio_rw; /* operation */ +}; + +int uiomove(void *cp, int n, struct uio *uio); + +__END_DECLS + +#ifdef __cplusplus + +template <typename F> +static inline void linearize_uio_write(struct uio *uio, int ioflag, F f) +{ + while (uio->uio_resid > 0) { + struct iovec *iov = uio->uio_iov; + + if (iov->iov_len) { + f(reinterpret_cast<const char *>(iov->iov_base), + iov->iov_len); + } + + uio->uio_iov++; + uio->uio_iovcnt--; + uio->uio_resid -= iov->iov_len; + uio->uio_offset += iov->iov_len; + } +} + +#endif + +#endif /* !_UIO_H_ */ diff --git a/lib/vfscore/include/vfscore/vnode.h b/lib/vfscore/include/vfscore/vnode.h new file mode 100644 index 00000000..e35aa830 --- /dev/null +++ b/lib/vfscore/include/vfscore/vnode.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_VNODE_H_ +#define _SYS_VNODE_H_ + +#ifdef _KERNEL + +#include <sys/cdefs.h> +#include <sys/stat.h> +#include <osv/prex.h> +#include <osv/uio.h> +#include <osv/mutex.h> +#include "file.h" +#include "dirent.h" + +__BEGIN_DECLS + +struct vfsops; +struct vnops; +struct vnode; +struct file; + +/* + * Vnode types. + */ +enum vtype { + VNON, /* no type */ + VREG, /* regular file */ + VDIR, /* directory */ + VBLK, /* block device */ + VCHR, /* character device */ + VLNK, /* symbolic link */ + VSOCK, /* socks */ + VFIFO, /* FIFO */ + VBAD +}; + +/* + * Reading or writing any of these items requires holding the + * appropriate lock. 
+ */ +struct vnode { + uint64_t v_ino; /* inode number */ + LIST_ENTRY(vnode) v_link; /* link for hash list */ + struct mount *v_mount; /* mounted vfs pointer */ + struct vnops *v_op; /* vnode operations */ + int v_refcnt; /* reference count */ + int v_type; /* vnode type */ + int v_flags; /* vnode flag */ + mode_t v_mode; /* file mode */ + off_t v_size; /* file size */ + mutex_t v_lock; /* lock for this vnode */ + LIST_HEAD(, dentry) v_names; /* directory entries pointing at this */ + int v_nrlocks; /* lock count (for debug) */ + void *v_data; /* private data for fs */ +}; + +/* flags for vnode */ +#define VROOT 0x0001 /* root of its file system */ +#define VISTTY 0x0002 /* device is tty */ +#define VPROTDEV 0x0004 /* protected device */ + +/* + * Vnode attribute + */ +struct vattr { + unsigned int va_mask; + enum vtype va_type; /* vnode type */ + mode_t va_mode; /* file access mode */ + nlink_t va_nlink; + uid_t va_uid; + gid_t va_gid; + dev_t va_fsid; /* id of the underlying filesystem */ + ino_t va_nodeid; + struct timespec va_atime; + struct timespec va_mtime; + struct timespec va_ctime; + dev_t va_rdev; + uint64_t va_nblocks; + off_t va_size; +}; + +/* + * Modes. 
+ */ +#define VAPPEND 00010 +#define VREAD 00004 /* read, write, execute permissions */ +#define VWRITE 00002 +#define VEXEC 00001 + +#define IO_APPEND 0x0001 +#define IO_SYNC 0x0002 + +/* + * ARC actions + */ +#define ARC_ACTION_QUERY 0 +#define ARC_ACTION_HOLD 1 +#define ARC_ACTION_RELEASE 2 + +typedef int (*vnop_open_t) (struct file *); +typedef int (*vnop_close_t) (struct vnode *, struct file *); +typedef int (*vnop_read_t) (struct vnode *, struct file *, struct uio *, int); +typedef int (*vnop_write_t) (struct vnode *, struct uio *, int); +typedef int (*vnop_seek_t) (struct vnode *, struct file *, off_t, off_t); +typedef int (*vnop_ioctl_t) (struct vnode *, struct file *, u_long, void *); +typedef int (*vnop_fsync_t) (struct vnode *, struct file *); +typedef int (*vnop_readdir_t) (struct vnode *, struct file *, struct dirent *); +typedef int (*vnop_lookup_t) (struct vnode *, char *, struct vnode **); +typedef int (*vnop_create_t) (struct vnode *, char *, mode_t); +typedef int (*vnop_remove_t) (struct vnode *, struct vnode *, char *); +typedef int (*vnop_rename_t) (struct vnode *, struct vnode *, char *, + struct vnode *, struct vnode *, char *); +typedef int (*vnop_mkdir_t) (struct vnode *, char *, mode_t); +typedef int (*vnop_rmdir_t) (struct vnode *, struct vnode *, char *); +typedef int (*vnop_getattr_t) (struct vnode *, struct vattr *); +typedef int (*vnop_setattr_t) (struct vnode *, struct vattr *); +typedef int (*vnop_inactive_t) (struct vnode *); +typedef int (*vnop_truncate_t) (struct vnode *, off_t); +typedef int (*vnop_link_t) (struct vnode *, struct vnode *, char *); +typedef int (*vnop_cache_t) (struct vnode *, struct file *, struct uio *); +typedef int (*vnop_fallocate_t) (struct vnode *, int, loff_t, loff_t); +typedef int (*vnop_readlink_t) (struct vnode *, struct uio *); +typedef int (*vnop_symlink_t) (struct vnode *, char *, char *); + +/* + * vnode operations + */ +struct vnops { + vnop_open_t vop_open; + vnop_close_t vop_close; + vnop_read_t 
vop_read; + vnop_write_t vop_write; + vnop_seek_t vop_seek; + vnop_ioctl_t vop_ioctl; + vnop_fsync_t vop_fsync; + vnop_readdir_t vop_readdir; + vnop_lookup_t vop_lookup; + vnop_create_t vop_create; + vnop_remove_t vop_remove; + vnop_rename_t vop_rename; + vnop_mkdir_t vop_mkdir; + vnop_rmdir_t vop_rmdir; + vnop_getattr_t vop_getattr; + vnop_setattr_t vop_setattr; + vnop_inactive_t vop_inactive; + vnop_truncate_t vop_truncate; + vnop_link_t vop_link; + vnop_cache_t vop_cache; + vnop_fallocate_t vop_fallocate; + vnop_readlink_t vop_readlink; + vnop_symlink_t vop_symlink; +}; + +/* + * vnode interface + */ +#define VOP_OPEN(VP, FP) ((VP)->v_op->vop_open)(FP) +#define VOP_CLOSE(VP, FP) ((VP)->v_op->vop_close)(VP, FP) +#define VOP_READ(VP, FP, U, F) ((VP)->v_op->vop_read)(VP, FP, U, F) +#define VOP_CACHE(VP, FP, U) ((VP)->v_op->vop_cache)(VP, FP, U) +#define VOP_WRITE(VP, U, F) ((VP)->v_op->vop_write)(VP, U, F) +#define VOP_SEEK(VP, FP, OLD, NEW) ((VP)->v_op->vop_seek)(VP, FP, OLD, NEW) +#define VOP_IOCTL(VP, FP, C, A) ((VP)->v_op->vop_ioctl)(VP, FP, C, A) +#define VOP_FSYNC(VP, FP) ((VP)->v_op->vop_fsync)(VP, FP) +#define VOP_READDIR(VP, FP, DIR) ((VP)->v_op->vop_readdir)(VP, FP, DIR) +#define VOP_LOOKUP(DVP, N, VP) ((DVP)->v_op->vop_lookup)(DVP, N, VP) +#define VOP_CREATE(DVP, N, M) ((DVP)->v_op->vop_create)(DVP, N, M) +#define VOP_REMOVE(DVP, VP, N) ((DVP)->v_op->vop_remove)(DVP, VP, N) +#define VOP_RENAME(DVP1, VP1, N1, DVP2, VP2, N2) \ + ((DVP1)->v_op->vop_rename)(DVP1, VP1, N1, DVP2, VP2, N2) +#define VOP_MKDIR(DVP, N, M) ((DVP)->v_op->vop_mkdir)(DVP, N, M) +#define VOP_RMDIR(DVP, VP, N) ((DVP)->v_op->vop_rmdir)(DVP, VP, N) +#define VOP_GETATTR(VP, VAP) ((VP)->v_op->vop_getattr)(VP, VAP) +#define VOP_SETATTR(VP, VAP) ((VP)->v_op->vop_setattr)(VP, VAP) +#define VOP_INACTIVE(VP) ((VP)->v_op->vop_inactive)(VP) +#define VOP_TRUNCATE(VP, N) ((VP)->v_op->vop_truncate)(VP, N) +#define VOP_LINK(DVP, SVP, N) ((DVP)->v_op->vop_link)(DVP, SVP, N) +#define VOP_FALLOCATE(VP, 
M, OFF, LEN) ((VP)->v_op->vop_fallocate)(VP, M, OFF, LEN) +#define VOP_READLINK(VP, U) ((VP)->v_op->vop_readlink)(VP, U) +#define VOP_SYMLINK(DVP, OP, NP) ((DVP)->v_op->vop_symlink)(DVP, OP, NP) + +int vop_nullop(void); +int vop_einval(void); +int vop_eperm(void); +int vop_erofs(void); +struct vnode *vn_lookup(struct mount *, uint64_t); +void vn_lock(struct vnode *); +void vn_unlock(struct vnode *); +int vn_stat(struct vnode *, struct stat *); +int vn_settimes(struct vnode *, struct timespec[2]); +int vn_setmode(struct vnode *, mode_t mode); +int vn_access(struct vnode *, int); +int vget(struct mount *, uint64_t ino, struct vnode **vpp); +void vput(struct vnode *); +void vref(struct vnode *); +void vrele(struct vnode *); +void vflush(struct mount *); +void vn_add_name(struct vnode *, struct dentry *); +void vn_del_name(struct vnode *, struct dentry *); + +extern enum vtype iftovt_tab[]; +extern int vttoif_tab[]; +#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) +#define VTTOIF(indx) (vttoif_tab[(int)(indx)]) +#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) + +#define VATTR_NULL(vp) (*(vp) = (vattr_t){}) + +static inline void vnode_pager_setsize(struct vnode *vp, off_t size) +{ + vp->v_size = size; +} + +__END_DECLS + +#endif + +#endif /* !_SYS_VNODE_H_ */ diff --git a/lib/vfscore/lookup.c b/lib/vfscore/lookup.c new file mode 100644 index 00000000..ad03fe25 --- /dev/null +++ b/lib/vfscore/lookup.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <stdlib.h> +#include <sys/param.h> + +#include <osv/dentry.h> +#include <osv/vnode.h> +#include "vfs.h" + +static ssize_t +read_link(struct vnode *vp, char *buf, size_t bufsz, ssize_t *sz) +{ + struct iovec iov = {buf, bufsz}; + struct uio uio = {&iov, 1, 0, (ssize_t) bufsz, UIO_READ}; + int rc; + + *sz = 0; + vn_lock(vp); + rc = VOP_READLINK(vp, &uio); + vn_unlock(vp); + + if (rc != 0) { + return (rc); + } + + *sz = bufsz - uio.uio_resid; + return (0); +} + +int +namei_follow_link(struct dentry *dp, char *node, char *name, char *fp, size_t mountpoint_len) +{ + std::unique_ptr<char []> link (new char[PATH_MAX]); + std::unique_ptr<char []> t (new char[PATH_MAX]); + char *lp; + int error; + ssize_t sz; + char *p; + int c; + + lp = link.get(); + error = read_link(dp->d_vnode, lp, PATH_MAX, &sz); + if (error != 0) { + return (error); + } + lp[sz] = 0; + + p = fp + mountpoint_len + strlen(node); + c = strlen(node) - strlen(name) - 1; + node[c] = 0; + + if (lp[0] == '/') { + strlcat(lp, p, PATH_MAX); + strlcpy(fp, lp, PATH_MAX); + } else { + strlcpy(t.get(), p, PATH_MAX); + strlcpy(node, fp, mountpoint_len + c + 1); + path_conv(node, lp, fp); + strlcat(fp, t.get(), PATH_MAX); + } + node[0] = 0; + name[0] = 0; + return (0); +} +/* + * Convert a pathname into a pointer to a dentry + * + * @path: full path name. + * @dpp: dentry to be returned. 
+ */ +int +namei(const char *path, struct dentry **dpp) +{ + char *p; + char node[PATH_MAX]; + char name[PATH_MAX]; + std::unique_ptr<char []> fp (new char [PATH_MAX]); + std::unique_ptr<char []> t (new char [PATH_MAX]); + struct mount *mp; + struct dentry *dp, *ddp; + struct vnode *dvp, *vp; + int error, i; + int links_followed; + bool need_continue; + + DPRINTF(VFSDB_VNODE, ("namei: path=%s\n", path)); + + links_followed = 0; + strlcpy(fp.get(), path, PATH_MAX); + + do { + need_continue = false; + /* + * Convert a full path name to its mount point and + * the local node in the file system. + */ + if (vfs_findroot(fp.get(), &mp, &p)) { + return ENOTDIR; + } + int mountpoint_len = p - fp.get() - 1; + strlcpy(node, "/", sizeof(node)); + strlcat(node, p, sizeof(node)); + dp = dentry_lookup(mp, node); + if (dp) { + /* vnode is already active. */ + *dpp = dp; + return 0; + } + /* + * Find target vnode, started from root directory. + * This is done to attach the fs specific data to + * the target vnode. + */ + ddp = mp->m_root; + if (!ddp) { + sys_panic("VFS: no root"); + } + dref(ddp); + + node[0] = '\0'; + + while (*p != '\0') { + /* + * Get lower directory/file name. + */ + while (*p == '/') { + p++; + } + + if (*p == '\0') { + break; + } + + for (i = 0; i < PATH_MAX; i++) { + if (*p == '\0' || *p == '/') { + break; + } + name[i] = *p++; + } + name[i] = '\0'; + + /* + * Get a vnode for the target. + */ + strlcat(node, "/", sizeof(node)); + strlcat(node, name, sizeof(node)); + dvp = ddp->d_vnode; + vn_lock(dvp); + dp = dentry_lookup(mp, node); + if (dp == nullptr) { + /* Find a vnode in this directory. 
*/ + error = VOP_LOOKUP(dvp, name, &vp); + if (error) { + vn_unlock(dvp); + drele(ddp); + return error; + } + + dp = dentry_alloc(ddp, vp, node); + vput(vp); + + if (!dp) { + vn_unlock(dvp); + drele(ddp); + return ENOMEM; + } + } + vn_unlock(dvp); + drele(ddp); + ddp = dp; + + if (dp->d_vnode->v_type == VLNK) { + error = namei_follow_link(dp, node, name, fp.get(), mountpoint_len); + if (error) { + drele(dp); + return (error); + } + + drele(dp); + + p = fp.get(); + dp = nullptr; + ddp = nullptr; + vp = nullptr; + dvp = nullptr; + name[0] = 0; + node[0] = 0; + + if (++links_followed >= MAXSYMLINKS) { + return (ELOOP); + } + need_continue = true; + break; + } + + if (*p == '/' && ddp->d_vnode->v_type != VDIR) { + drele(ddp); + return ENOTDIR; + } + } + } while (need_continue == true); + + *dpp = dp; + return 0; +} + +/* + * Convert last component in the path to pointer to dentry + * + * @path: full path name + * @ddp : pointer to dentry of parent + * @dpp : dentry to be returned + */ +int +namei_last_nofollow(char *path, struct dentry *ddp, struct dentry **dpp) +{ + char *name; + int error; + struct mount *mp; + char *p; + struct dentry *dp; + struct vnode *dvp; + struct vnode *vp; + std::unique_ptr<char []> node (new char[PATH_MAX]); + + dvp = nullptr; + + if (path[0] != '/') { + return (ENOTDIR); + } + + name = strrchr(path, '/'); + if (name == nullptr) { + return (ENOENT); + } + name++; + + error = vfs_findroot(path, &mp, &p); + if (error != 0) { + return (ENOTDIR); + } + + strlcpy(node.get(), "/", PATH_MAX); + strlcat(node.get(), p, PATH_MAX); + + // We want to treat things like /tmp/ the same as /tmp. Best way to do that + // is to ignore the last character, except when we're stating the root. 
+ auto l = strlen(node.get()) - 1; + if (l && node.get()[l] == '/') { + node.get()[l] = '\0'; + } + + dvp = ddp->d_vnode; + vn_lock(dvp); + dp = dentry_lookup(mp, node.get()); + if (dp == nullptr) { + error = VOP_LOOKUP(dvp, name, &vp); + if (error != 0) { + goto out; + } + + dp = dentry_alloc(ddp, vp, node.get()); + vput(vp); + + if (dp == nullptr) { + error = ENOMEM; + goto out; + } + } + + *dpp = dp; + error = 0; +out: + if (dvp != nullptr) { + vn_unlock(dvp); + } + return (error); +} + +/* + * Search a pathname. + * This is a very central but not so complicated routine. ;-P + * + * @path: full path. + * @dpp: pointer to dentry for directory. + * @name: if non-null, pointer to file name in path. + * + * This routine returns a locked directory vnode and file name. + */ +int +lookup(char *path, struct dentry **dpp, char **name) +{ + char buf[PATH_MAX]; + char root[] = "/"; + char *file, *dir; + struct dentry *dp; + int error; + + DPRINTF(VFSDB_VNODE, ("lookup: path=%s\n", path)); + + /* + * Get the path for directory. + */ + strlcpy(buf, path, sizeof(buf)); + file = strrchr(buf, '/'); + if (!buf[0]) { + return ENOTDIR; + } + if (file == buf) { + dir = root; + } else { + *file = '\0'; + dir = buf; + } + /* + * Get the vnode for directory + */ + if ((error = namei(dir, &dp)) != 0) { + return error; + } + if (dp->d_vnode->v_type != VDIR) { + drele(dp); + return ENOTDIR; + } + + *dpp = dp; + + if (name) { + /* + * Get the file name + */ + *name = strrchr(path, '/') + 1; + } + return 0; +} + +/* + * vnode_init() is called once (from vfs_init) + * in initialization. + */ +void +lookup_init(void) +{ + dentry_init(); +} diff --git a/lib/vfscore/main.c b/lib/vfscore/main.c new file mode 100644 index 00000000..cd141117 --- /dev/null +++ b/lib/vfscore/main.c @@ -0,0 +1,2413 @@ +/* + * Copyright (C) 2013 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. 
+ */ + +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/sendfile.h> + +#include <limits.h> +#include <unistd.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <signal.h> +#define open __open_variadic +#define fcntl __fcntl_variadic +#include <fcntl.h> +#undef open +#undef fcntl + +#include <osv/prex.h> +#include <osv/vnode.h> +#include <osv/stubbing.hh> +#include <osv/ioctl.h> +#include <osv/trace.hh> +#include <osv/run.hh> +#include <drivers/console.hh> + +#include "vfs.h" + +#include "libc/internal/libc.h" + +#include <algorithm> +#include <unordered_map> + +#include <sys/file.h> + +#include "fs/fs.hh" +#include "libc/libc.hh" + +#include <mntent.h> +#include <sys/mman.h> + +#include <osv/clock.hh> +#include <api/utime.h> +#include <chrono> + +using namespace std; + + +#ifdef DEBUG_VFS +int vfs_debug = VFSDB_FLAGS; +#endif + +std::atomic<mode_t> global_umask{S_IWGRP | S_IWOTH}; + +static inline mode_t apply_umask(mode_t mode) +{ + return mode & ~global_umask.load(std::memory_order_relaxed); +} + +TRACEPOINT(trace_vfs_open, "\"%s\" 0x%x 0%0o", const char*, int, mode_t); +TRACEPOINT(trace_vfs_open_ret, "%d", int); +TRACEPOINT(trace_vfs_open_err, "%d", int); + +struct task *main_task; /* we only have a single process */ + +extern "C" +int open(const char *pathname, int flags, ...) 
+{ + mode_t mode = 0; + /* The optional third argument is only read when O_CREAT is set. */ + if (flags & O_CREAT) { + va_list ap; + va_start(ap, flags); + mode = apply_umask(va_arg(ap, mode_t)); + va_end(ap); + } + + trace_vfs_open(pathname, flags, mode); + + struct task *t = main_task; + char path[PATH_MAX]; + struct file *fp; + int fd, error; + int acc; + + /* Translate the O_ACCMODE bits into the VREAD/VWRITE mask given to task_conv(). */ + acc = 0; + switch (flags & O_ACCMODE) { + case O_RDONLY: + acc = VREAD; + break; + case O_WRONLY: + acc = VWRITE; + break; + case O_RDWR: + acc = VREAD | VWRITE; + break; + } + + error = task_conv(t, pathname, acc, path); + if (error) + goto out_errno; + + error = sys_open(path, flags, mode, &fp); + if (error) + goto out_errno; + + error = fdalloc(fp, &fd); + if (error) + goto out_fput; + /* Drop our own reference; presumably fdalloc() took one for the descriptor -- TODO confirm. */ + fdrop(fp); + trace_vfs_open_ret(fd); + return fd; + + out_fput: + fdrop(fp); + out_errno: + errno = error; + trace_vfs_open_err(error); + return -1; +} + +LFS64(open); + +/* + * openat(): open pathname relative to the directory referred to by dirfd. + * Absolute paths and dirfd == AT_FDCWD are forwarded to open() unchanged; + * otherwise an absolute path is assembled from the mount path and dentry + * path of dirfd and passed to open(). Returns open()'s result, or -1 with + * errno set when dirfd cannot be resolved by fget(). + */ +int openat(int dirfd, const char *pathname, int flags, ...) +{ + mode_t mode = 0; + if (flags & O_CREAT) { + va_list ap; + va_start(ap, flags); + mode = apply_umask(va_arg(ap, mode_t)); + va_end(ap); + } + + if (pathname[0] == '/' || dirfd == AT_FDCWD) { + return open(pathname, flags, mode); + } + + struct file *fp; + int error = fget(dirfd, &fp); + if (error) { + errno = error; + return -1; + } + + struct vnode *vp = fp->f_dentry->d_vnode; + vn_lock(vp); + + std::unique_ptr<char []> up (new char[PATH_MAX]); + char *p = up.get(); + + /* build absolute path */ + strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); + strlcat(p, fp->f_dentry->d_path, PATH_MAX); + strlcat(p, "/", PATH_MAX); + strlcat(p, pathname, PATH_MAX); + + /* Despite its name, error now holds open()'s result (descriptor or -1). */ + error = open(p, flags, mode); + + vn_unlock(vp); + fdrop(fp); + + return error; +} +LFS64(openat); + +// open() has an optional third argument, "mode", which is only needed in +// some cases (when the O_CREAT mode is used). As a safety feature, recent +// versions of Glibc add a feature where open() with two arguments is replaced +// by a call to __open_2(), which verifies it isn't called with O_CREAT. 
+extern "C" int __open_2(const char *pathname, int flags) +{ + assert(!(flags & O_CREAT)); + return open(pathname, flags, 0); +} + +extern "C" int __open64_2(const char *file, int flags) +{ + if (flags & O_CREAT) { + errno = EINVAL; + return -1; + } + + return open64(file, flags); +} + +int creat(const char *pathname, mode_t mode) +{ + return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); +} +LFS64(creat); + +TRACEPOINT(trace_vfs_close, "%d", int); +TRACEPOINT(trace_vfs_close_ret, ""); +TRACEPOINT(trace_vfs_close_err, "%d", int); + +int close(int fd) +{ + int error; + + trace_vfs_close(fd); + error = fdclose(fd); + if (error) + goto out_errno; + + trace_vfs_close_ret(); + return 0; + + out_errno: + trace_vfs_close_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_mknod, "\"%s\" 0%0o 0x%x", const char*, mode_t, dev_t); +TRACEPOINT(trace_vfs_mknod_ret, ""); +TRACEPOINT(trace_vfs_mknod_err, "%d", int); + + +extern "C" +int __xmknod(int ver, const char *pathname, mode_t mode, dev_t *dev) +{ + assert(ver == 0); // On x86-64 Linux, _MKNOD_VER_LINUX is 0. 
+ struct task *t = main_task; + char path[PATH_MAX]; + int error; + + /* dev is only passed to the tracepoint; sys_mknod() below receives path and mode. */ + trace_vfs_mknod(pathname, mode, *dev); + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + + error = sys_mknod(path, mode); + if (error) + goto out_errno; + + trace_vfs_mknod_ret(); + return 0; + + out_errno: + trace_vfs_mknod_err(error); + errno = error; + return -1; +} + +/* mknod(): thin wrapper that calls __xmknod() with version 0. */ +int mknod(const char *pathname, mode_t mode, dev_t dev) +{ + return __xmknod(0, pathname, mode, &dev); +} + + +TRACEPOINT(trace_vfs_lseek, "%d 0x%x %d", int, off_t, int); +TRACEPOINT(trace_vfs_lseek_ret, "0x%x", off_t); +TRACEPOINT(trace_vfs_lseek_err, "%d", int); + +/* + * lseek(): look up the file behind fd and let sys_lseek() reposition it. + * Returns the offset sys_lseek() stored in its out parameter, or -1 with + * errno set to the error from fget() or sys_lseek(). + */ +off_t lseek(int fd, off_t offset, int whence) +{ + struct file *fp; + off_t org; + int error; + + trace_vfs_lseek(fd, offset, whence); + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_lseek(fp, offset, whence, &org); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_lseek_ret(org); + return org; + + out_errno: + trace_vfs_lseek_err(error); + errno = error; + return -1; +} + +LFS64(lseek); + +TRACEPOINT(trace_vfs_pread, "%d %p 0x%x 0x%x", int, void*, size_t, off_t); +TRACEPOINT(trace_vfs_pread_ret, "0x%x", ssize_t); +TRACEPOINT(trace_vfs_pread_err, "%d", int); + +// In BSD's internal implementation of read() and write() code, for example +// sosend_generic(), a partial read or write returns both an EWOULDBLOCK error +// *and* a non-zero number of written bytes. In that case, we need to zero the +// error, so the system call appears to be a successful partial read/write. +// In FreeBSD, dofilewrite() and dofileread() (sys_generic.c) do this too. 
+static inline bool has_error(int error, int bytes) +{ + return error && ( + (bytes == 0) || + (error != EWOULDBLOCK && error != EINTR && error != ERESTART)); +} + + +ssize_t pread(int fd, void *buf, size_t count, off_t offset) +{ + trace_vfs_pread(fd, buf, count, offset); + struct iovec iov = { + .iov_base = buf, + .iov_len = count, + }; + struct file *fp; + size_t bytes; + int error; + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_read(fp, &iov, 1, offset, &bytes); + fdrop(fp); + + if (has_error(error, bytes)) + goto out_errno; + trace_vfs_pread_ret(bytes); + return bytes; + + out_errno: + trace_vfs_pread_err(error); + errno = error; + return -1; +} + +LFS64(pread); + +ssize_t read(int fd, void *buf, size_t count) +{ + return pread(fd, buf, count, -1); +} + +TRACEPOINT(trace_vfs_pwrite, "%d %p 0x%x 0x%x", int, const void*, size_t, off_t); +TRACEPOINT(trace_vfs_pwrite_ret, "0x%x", ssize_t); +TRACEPOINT(trace_vfs_pwrite_err, "%d", int); + +ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) +{ + trace_vfs_pwrite(fd, buf, count, offset); + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = count, + }; + struct file *fp; + size_t bytes; + int error; + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_write(fp, &iov, 1, offset, &bytes); + fdrop(fp); + + if (has_error(error, bytes)) + goto out_errno; + trace_vfs_pwrite_ret(bytes); + return bytes; + + out_errno: + trace_vfs_pwrite_err(error); + errno = error; + return -1; +} + +LFS64(pwrite); + +ssize_t write(int fd, const void *buf, size_t count) +{ + return pwrite(fd, buf, count, -1); +} + +ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct file *fp; + size_t bytes; + int error; + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_read(fp, iov, iovcnt, offset, &bytes); + fdrop(fp); + + if (has_error(error, bytes)) + goto out_errno; + return bytes; + + out_errno: + errno = error; + return 
-1; +} + +LFS64(preadv); + +ssize_t readv(int fd, const struct iovec *iov, int iovcnt) +{ + return preadv(fd, iov, iovcnt, -1); +} + +TRACEPOINT(trace_vfs_pwritev, "%d %p 0x%x 0x%x", int, const struct iovec*, int, off_t); +TRACEPOINT(trace_vfs_pwritev_ret, "0x%x", ssize_t); +TRACEPOINT(trace_vfs_pwritev_err, "%d", int); + +ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct file *fp; + size_t bytes; + int error; + + trace_vfs_pwritev(fd, iov, iovcnt, offset); + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_write(fp, iov, iovcnt, offset, &bytes); + fdrop(fp); + + if (has_error(error, bytes)) + goto out_errno; + trace_vfs_pwritev_ret(bytes); + return bytes; + + out_errno: + trace_vfs_pwritev_err(error); + errno = error; + return -1; +} +LFS64(pwritev); + +ssize_t writev(int fd, const struct iovec *iov, int iovcnt) +{ + return pwritev(fd, iov, iovcnt, -1); +} + +TRACEPOINT(trace_vfs_ioctl, "%d 0x%x", int, unsigned long); +TRACEPOINT(trace_vfs_ioctl_ret, ""); +TRACEPOINT(trace_vfs_ioctl_err, "%d", int); + +int ioctl(int fd, unsigned long int request, ...) 
+{ + struct file *fp; + int error; + va_list ap; + void* arg; + + trace_vfs_ioctl(fd, request); + /* glibc ABI provides a variadic prototype for ioctl so we need to agree + * with it, since we now include sys/ioctl.h + * read the first argument and pass it to sys_ioctl() */ + va_start(ap, request); + arg = va_arg(ap, void*); + va_end(ap); + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_ioctl(fp, request, arg); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_ioctl_ret(); + return 0; + + out_errno: + trace_vfs_ioctl_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_fsync, "%d", int); +TRACEPOINT(trace_vfs_fsync_ret, ""); +TRACEPOINT(trace_vfs_fsync_err, "%d", int); + +int fsync(int fd) +{ + struct file *fp; + int error; + + trace_vfs_fsync(fd); + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_fsync(fp); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_fsync_ret(); + return 0; + + out_errno: + trace_vfs_fsync_err(error); + errno = error; + return -1; +} + +int fdatasync(int fd) +{ + // TODO: See if we can do less than fsync(). 
+ return fsync(fd); +} + +TRACEPOINT(trace_vfs_fstat, "%d %p", int, struct stat*); +TRACEPOINT(trace_vfs_fstat_ret, ""); +TRACEPOINT(trace_vfs_fstat_err, "%d", int); + +extern "C" +int __fxstat(int ver, int fd, struct stat *st) +{ + struct file *fp; + int error; + + trace_vfs_fstat(fd, st); + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_fstat(fp, st); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_fstat_ret(); + return 0; + + out_errno: + trace_vfs_fstat_err(error); + errno = error; + return -1; +} + +LFS64(__fxstat); + +extern "C" +int fstat(int fd, struct stat *st) +{ + return __fxstat(1, fd, st); +} + +LFS64(fstat); + +extern "C" +int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st, + int flags) +{ + if (flags & AT_SYMLINK_NOFOLLOW) { + UNIMPLEMENTED("fstatat() with AT_SYMLINK_NOFOLLOW"); + } + + if (pathname[0] == '/' || dirfd == AT_FDCWD) { + return stat(pathname, st); + } + // If AT_EMPTY_PATH and pathname is an empty string, fstatat() operates on + // dirfd itself, and in that case it doesn't have to be a directory. 
+ if ((flags & AT_EMPTY_PATH) && !pathname[0]) { + return fstat(dirfd, st); + } + + struct file *fp; + int error = fget(dirfd, &fp); + if (error) { + errno = error; + return -1; + } + + struct vnode *vp = fp->f_dentry->d_vnode; + vn_lock(vp); + + std::unique_ptr<char []> up (new char[PATH_MAX]); + char *p = up.get(); + /* build absolute path */ + strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); + strlcat(p, fp->f_dentry->d_path, PATH_MAX); + strlcat(p, "/", PATH_MAX); + strlcat(p, pathname, PATH_MAX); + + error = stat(p, st); + + vn_unlock(vp); + fdrop(fp); + + return error; +} + +LFS64(__fxstatat); + +extern "C" +int fstatat(int dirfd, const char *path, struct stat *st, int flags) +{ + return __fxstatat(1, dirfd, path, st, flags); +} + +LFS64(fstatat); + +extern "C" int flock(int fd, int operation) +{ + if (!fileref_from_fd(fd)) { + return libc_error(EBADF); + } + + switch (operation) { + case LOCK_SH: + case LOCK_SH | LOCK_NB: + case LOCK_EX: + case LOCK_EX | LOCK_NB: + case LOCK_UN: + break; + default: + return libc_error(EINVAL); + } + + return 0; +} + +TRACEPOINT(trace_vfs_readdir, "%d %p", int, dirent*); +TRACEPOINT(trace_vfs_readdir_ret, ""); +TRACEPOINT(trace_vfs_readdir_err, "%d", int); + +struct __dirstream +{ + int fd; +}; + +DIR *opendir(const char *path) +{ + DIR *dir = new DIR; + + if (!dir) + return libc_error_ptr<DIR>(ENOMEM); + + dir->fd = open(path, O_RDONLY); + if (dir->fd < 0) { + delete dir; + return nullptr; + } + return dir; +} + +DIR *fdopendir(int fd) +{ + DIR *dir; + struct stat st; + if (fstat(fd, &st) < 0) { + return nullptr; + } + if (!S_ISDIR(st.st_mode)) { + errno = ENOTDIR; + return nullptr; + } + dir = new DIR; + dir->fd = fd; + return dir; + +} + +int dirfd(DIR *dirp) +{ + if (!dirp) { + return libc_error(EINVAL); + } + + return dirp->fd; +} + +int closedir(DIR *dir) +{ + close(dir->fd); + delete dir; + return 0; +} + +struct dirent *readdir(DIR *dir) +{ + static __thread struct dirent entry, *result; + int ret; + + ret = 
readdir_r(dir, &entry, &result); + if (ret) + return libc_error_ptr<struct dirent>(ret); + + errno = 0; + return result; +} + +int readdir_r(DIR *dir, struct dirent *entry, struct dirent **result) +{ + int error; + struct file *fp; + + trace_vfs_readdir(dir->fd, entry); + error = fget(dir->fd, &fp); + if (error) { + trace_vfs_readdir_err(error); + } else { + error = sys_readdir(fp, entry); + fdrop(fp); + if (error) { + trace_vfs_readdir_err(error); + } else { + trace_vfs_readdir_ret(); + } + } + // Our dirent has (like Linux) a d_reclen field, but a constant size. + entry->d_reclen = sizeof(*entry); + + if (error) { + *result = nullptr; + } else { + *result = entry; + } + return error == ENOENT ? 0 : error; +} + +// FIXME: in 64bit dirent64 and dirent are identical, so it's safe to alias +#undef readdir64_r +extern "C" int readdir64_r(DIR *dir, struct dirent64 *entry, + struct dirent64 **result) + __attribute__((alias("readdir_r"))); + +#undef readdir64 +extern "C" struct dirent *readdir64(DIR *dir) __attribute__((alias("readdir"))); + +void rewinddir(DIR *dirp) +{ + struct file *fp; + + auto error = fget(dirp->fd, &fp); + if (error) { + // POSIX specifies that what rewinddir() does in the case of error + // is undefined... + return; + } + + sys_rewinddir(fp); + // Again, error code from sys_rewinddir() is ignored. + fdrop(fp); +} + +long telldir(DIR *dirp) +{ + struct file *fp; + int error = fget(dirp->fd, &fp); + if (error) { + return libc_error(error); + } + + long loc; + error = sys_telldir(fp, &loc); + fdrop(fp); + if (error) { + return libc_error(error); + } + return loc; +} + +void seekdir(DIR *dirp, long loc) +{ + struct file *fp; + int error = fget(dirp->fd, &fp); + if (error) { + // POSIX specifies seekdir() cannot return errors. + return; + } + sys_seekdir(fp, loc); + // Again, error code from sys_seekdir() is ignored. 
+ fdrop(fp); +} + +TRACEPOINT(trace_vfs_mkdir, "\"%s\" 0%0o", const char*, mode_t); +TRACEPOINT(trace_vfs_mkdir_ret, ""); +TRACEPOINT(trace_vfs_mkdir_err, "%d", int); + +int +mkdir(const char *pathname, mode_t mode) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + mode = apply_umask(mode); + + trace_vfs_mkdir(pathname, mode); + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + + error = sys_mkdir(path, mode); + if (error) + goto out_errno; + trace_vfs_mkdir_ret(); + return 0; + out_errno: + trace_vfs_mkdir_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_rmdir, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_rmdir_ret, ""); +TRACEPOINT(trace_vfs_rmdir_err, "%d", int); + +int rmdir(const char *pathname) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + trace_vfs_rmdir(pathname); + error = ENOENT; + if (pathname == nullptr) + goto out_errno; + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + + error = sys_rmdir(path); + if (error) + goto out_errno; + trace_vfs_rmdir_ret(); + return 0; + out_errno: + trace_vfs_rmdir_err(error); + errno = error; + return -1; +} + +static void +get_last_component(const char *path, char *dst) +{ + int pos = strlen(path) - 1; + + while (pos >= 0 && path[pos] == '/') + pos--; + + int component_end = pos; + + while (pos >= 0 && path[pos] != '/') + pos--; + + int component_start = pos + 1; + + int len = component_end - component_start + 1; + memcpy(dst, path + component_start, len); + dst[len] = 0; +} + +static bool null_or_empty(const char *str) +{ + return str == nullptr || *str == '\0'; +} + +TRACEPOINT(trace_vfs_rename, "\"%s\" \"%s\"", const char*, const char*); +TRACEPOINT(trace_vfs_rename_ret, ""); +TRACEPOINT(trace_vfs_rename_err, "%d", int); + +int rename(const char *oldpath, const char *newpath) +{ + trace_vfs_rename(oldpath, newpath); + struct task *t = main_task; + char src[PATH_MAX]; + char dest[PATH_MAX]; + 
int error; + + error = ENOENT; + if (null_or_empty(oldpath) || null_or_empty(newpath)) + goto out_errno; + + get_last_component(oldpath, src); + if (!strcmp(src, ".") || !strcmp(src, "..")) { + error = EINVAL; + goto out_errno; + } + + get_last_component(newpath, dest); + if (!strcmp(dest, ".") || !strcmp(dest, "..")) { + error = EINVAL; + goto out_errno; + } + + if ((error = task_conv(t, oldpath, VREAD, src)) != 0) + goto out_errno; + + if ((error = task_conv(t, newpath, VWRITE, dest)) != 0) + goto out_errno; + + error = sys_rename(src, dest); + if (error) + goto out_errno; + trace_vfs_rename_ret(); + return 0; + out_errno: + trace_vfs_rename_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_chdir, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_chdir_ret, ""); +TRACEPOINT(trace_vfs_chdir_err, "%d", int); + +static int replace_cwd(struct task *t, struct file *new_cwdfp, + std::function<int (void)> chdir_func) +{ + struct file *old = nullptr; + + if (!t) { + return 0; + } + + if (t->t_cwdfp) { + old = t->t_cwdfp; + } + + /* Do the actual chdir operation here */ + int error = chdir_func(); + + t->t_cwdfp = new_cwdfp; + if (old) { + fdrop(old); + } + + return error; +} + +int chdir(const char *pathname) +{ + trace_vfs_chdir(pathname); + struct task *t = main_task; + char path[PATH_MAX]; + struct file *fp; + int error; + + error = ENOENT; + if (pathname == nullptr) + goto out_errno; + + if ((error = task_conv(t, pathname, VREAD, path)) != 0) + goto out_errno; + + /* Check if directory exits */ + error = sys_open(path, O_DIRECTORY, 0, &fp); + if (error) { + goto out_errno; + } + + replace_cwd(t, fp, [&]() { strlcpy(t->t_cwd, path, sizeof(t->t_cwd)); return 0; }); + + trace_vfs_chdir_ret(); + return 0; + out_errno: + errno = error; + trace_vfs_chdir_err(errno); + return -1; +} + +TRACEPOINT(trace_vfs_fchdir, "%d", int); +TRACEPOINT(trace_vfs_fchdir_ret, ""); +TRACEPOINT(trace_vfs_fchdir_err, "%d", int); + +int fchdir(int fd) +{ + trace_vfs_fchdir(fd); + 
struct task *t = main_task; + struct file *fp; + int error; + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = replace_cwd(t, fp, [&]() { return sys_fchdir(fp, t->t_cwd); }); + if (error) { + fdrop(fp); + goto out_errno; + } + + trace_vfs_fchdir_ret(); + return 0; + + out_errno: + trace_vfs_fchdir_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_link, "\"%s\" \"%s\"", const char*, const char*); +TRACEPOINT(trace_vfs_link_ret, ""); +TRACEPOINT(trace_vfs_link_err, "%d", int); + +int link(const char *oldpath, const char *newpath) +{ + struct task *t = main_task; + char path1[PATH_MAX]; + char path2[PATH_MAX]; + int error; + + trace_vfs_link(oldpath, newpath); + + error = ENOENT; + if (oldpath == nullptr || newpath == nullptr) + goto out_errno; + if ((error = task_conv(t, oldpath, VWRITE, path1)) != 0) + goto out_errno; + if ((error = task_conv(t, newpath, VWRITE, path2)) != 0) + goto out_errno; + + error = sys_link(path1, path2); + if (error) + goto out_errno; + trace_vfs_link_ret(); + return 0; + out_errno: + trace_vfs_link_err(error); + errno = error; + return -1; +} + + +TRACEPOINT(trace_vfs_symlink, "oldpath=%s, newpath=%s", const char*, const char*); +TRACEPOINT(trace_vfs_symlink_ret, ""); +TRACEPOINT(trace_vfs_symlink_err, "errno=%d", int); + +int symlink(const char *oldpath, const char *newpath) +{ + int error; + + trace_vfs_symlink(oldpath, newpath); + + error = ENOENT; + if (oldpath == nullptr || newpath == nullptr) { + errno = ENOENT; + trace_vfs_symlink_err(error); + return (-1); + } + + error = sys_symlink(oldpath, newpath); + if (error) { + errno = error; + trace_vfs_symlink_err(error); + return (-1); + } + + trace_vfs_symlink_ret(); + return 0; +} + +TRACEPOINT(trace_vfs_unlink, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_unlink_ret, ""); +TRACEPOINT(trace_vfs_unlink_err, "%d", int); + +int unlink(const char *pathname) +{ + trace_vfs_unlink(pathname); + struct task *t = main_task; + char path[PATH_MAX]; + int 
error; + + error = ENOENT; + if (pathname == nullptr) + goto out_errno; + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + + error = sys_unlink(path); + if (error) + goto out_errno; + trace_vfs_unlink_ret(); + return 0; + out_errno: + trace_vfs_unlink_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_stat, "\"%s\" %p", const char*, struct stat*); +TRACEPOINT(trace_vfs_stat_ret, ""); +TRACEPOINT(trace_vfs_stat_err, "%d", int); + +extern "C" +int __xstat(int ver, const char *pathname, struct stat *st) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + trace_vfs_stat(pathname, st); + + error = task_conv(t, pathname, 0, path); + if (error) + goto out_errno; + + error = sys_stat(path, st); + if (error) + goto out_errno; + trace_vfs_stat_ret(); + return 0; + + out_errno: + trace_vfs_stat_err(error); + errno = error; + return -1; +} + +LFS64(__xstat); + +int stat(const char *pathname, struct stat *st) +{ + return __xstat(1, pathname, st); +} + +LFS64(stat); + +TRACEPOINT(trace_vfs_lstat, "pathname=%s, stat=%p", const char*, struct stat*); +TRACEPOINT(trace_vfs_lstat_ret, ""); +TRACEPOINT(trace_vfs_lstat_err, "errno=%d", int); +extern "C" +int __lxstat(int ver, const char *pathname, struct stat *st) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + trace_vfs_lstat(pathname, st); + + error = task_conv(t, pathname, 0, path); + if (error) { + errno = error; + trace_vfs_lstat_err(error); + return (-1); + } + + error = sys_lstat(path, st); + if (error) { + errno = error; + trace_vfs_lstat_err(error); + return (-1); + } + + trace_vfs_lstat_ret(); + return 0; +} + +LFS64(__lxstat); + +int lstat(const char *pathname, struct stat *st) +{ + return __lxstat(1, pathname, st); +} + +LFS64(lstat); + +TRACEPOINT(trace_vfs_statfs, "\"%s\" %p", const char*, struct statfs*); +TRACEPOINT(trace_vfs_statfs_ret, ""); +TRACEPOINT(trace_vfs_statfs_err, "%d", int); + +extern "C" +int __statfs(const char 
*pathname, struct statfs *buf) +{ + trace_vfs_statfs(pathname, buf); + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + error = task_conv(t, pathname, 0, path); + if (error) + goto out_errno; + + error = sys_statfs(path, buf); + if (error) + goto out_errno; + trace_vfs_statfs_ret(); + return 0; + out_errno: + trace_vfs_statfs_err(error); + errno = error; + return -1; +} +weak_alias(__statfs, statfs); + +LFS64(statfs); + +TRACEPOINT(trace_vfs_fstatfs, "\"%s\" %p", int, struct statfs*); +TRACEPOINT(trace_vfs_fstatfs_ret, ""); +TRACEPOINT(trace_vfs_fstatfs_err, "%d", int); + +extern "C" +int __fstatfs(int fd, struct statfs *buf) +{ + struct file *fp; + int error; + + trace_vfs_fstatfs(fd, buf); + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_fstatfs(fp, buf); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_fstatfs_ret(); + return 0; + + out_errno: + trace_vfs_fstatfs_err(error); + errno = error; + return -1; +} +weak_alias(__fstatfs, fstatfs); + +LFS64(fstatfs); + +static int +statfs_to_statvfs(struct statvfs *dst, struct statfs *src) +{ + dst->f_bsize = src->f_bsize; + dst->f_frsize = src->f_bsize; + dst->f_blocks = src->f_blocks; + dst->f_bfree = src->f_bfree; + dst->f_bavail = src->f_bavail; + dst->f_files = src->f_files; + dst->f_ffree = src->f_ffree; + dst->f_favail = 0; + dst->f_fsid = src->f_fsid.__val[0]; + dst->f_flag = src->f_flags; + dst->f_namemax = src->f_namelen; + return 0; +} + +int +statvfs(const char *pathname, struct statvfs *buf) +{ + struct statfs st; + + if (__statfs(pathname, &st) < 0) + return -1; + return statfs_to_statvfs(buf, &st); +} + +LFS64(statvfs); + +int +fstatvfs(int fd, struct statvfs *buf) +{ + struct statfs st; + + if (__fstatfs(fd, &st) < 0) + return -1; + return statfs_to_statvfs(buf, &st); +} + +LFS64(fstatvfs); + + +TRACEPOINT(trace_vfs_getcwd, "%p %d", char*, size_t); +TRACEPOINT(trace_vfs_getcwd_ret, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_getcwd_err, "%d", int); + 
+char *getcwd(char *path, size_t size) +{ + trace_vfs_getcwd(path, size); + struct task *t = main_task; + int len = strlen(t->t_cwd) + 1; + int error; + + if (!path) { + if (!size) + size = len; + path = (char*)malloc(size); + if (!path) { + error = ENOMEM; + goto out_errno; + } + } else { + if (!size) { + error = EINVAL; + goto out_errno; + } + } + + if (size < len) { + error = ERANGE; + goto out_errno; + } + + memcpy(path, t->t_cwd, len); + trace_vfs_getcwd_ret(path); + return path; + + out_errno: + trace_vfs_getcwd_err(error); + errno = error; + return nullptr; +} + +TRACEPOINT(trace_vfs_dup, "%d", int); +TRACEPOINT(trace_vfs_dup_ret, "\"%s\"", int); +TRACEPOINT(trace_vfs_dup_err, "%d", int); +/* + * Duplicate a file descriptor + */ +int dup(int oldfd) +{ + struct file *fp; + int newfd; + int error; + + trace_vfs_dup(oldfd); + error = fget(oldfd, &fp); + if (error) + goto out_errno; + + error = fdalloc(fp, &newfd); + if (error) + goto out_fdrop; + + fdrop(fp); + trace_vfs_dup_ret(newfd); + return newfd; + + out_fdrop: + fdrop(fp); + out_errno: + trace_vfs_dup_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_dup3, "%d %d 0x%x", int, int, int); +TRACEPOINT(trace_vfs_dup3_ret, "%d", int); +TRACEPOINT(trace_vfs_dup3_err, "%d", int); +/* + * Duplicate a file descriptor to a particular value. + */ +int dup3(int oldfd, int newfd, int flags) +{ + struct file *fp; + int error; + + trace_vfs_dup3(oldfd, newfd, flags); + /* + * Don't allow any argument but O_CLOEXEC. But we even ignore + * that as we don't support exec() and thus don't care. 
+ */ + if ((flags & ~O_CLOEXEC) != 0) { + error = EINVAL; + goto out_errno; + } + + if (oldfd == newfd) { + error = EINVAL; + goto out_errno; + } + + error = fget(oldfd, &fp); + if (error) + goto out_errno; + + error = fdset(newfd, fp); + if (error) { + fdrop(fp); + goto out_errno; + } + + fdrop(fp); + trace_vfs_dup3_ret(newfd); + return newfd; + + out_errno: + trace_vfs_dup3_err(error); + errno = error; + return -1; +} + +int dup2(int oldfd, int newfd) +{ + if (oldfd == newfd) + return newfd; + + return dup3(oldfd, newfd, 0); +} + +/* + * The file control system call. + */ +#define SETFL (O_APPEND | O_ASYNC | O_DIRECT | O_NOATIME | O_NONBLOCK) + +TRACEPOINT(trace_vfs_fcntl, "%d %d 0x%x", int, int, int); +TRACEPOINT(trace_vfs_fcntl_ret, "\"%s\"", int); +TRACEPOINT(trace_vfs_fcntl_err, "%d", int); + +extern "C" +int fcntl(int fd, int cmd, int arg) +{ + struct file *fp; + int ret = 0, error; + int tmp; + + trace_vfs_fcntl(fd, cmd, arg); + error = fget(fd, &fp); + if (error) + goto out_errno; + + // An important note about our handling of FD_CLOEXEC / O_CLOEXEC: + // close-on-exec shouldn't have been a file flag (fp->f_flags) - it is a + // file descriptor flag, meaning that that two dup()ed file descriptors + // could have different values for FD_CLOEXEC. Our current implementation + // *wrongly* makes close-on-exec an f_flag (using the bit O_CLOEXEC). + // There is little practical difference, though, because this flag is + // ignored in OSv anyway, as it doesn't support exec(). + switch (cmd) { + case F_DUPFD: + error = _fdalloc(fp, &ret, arg); + if (error) + goto out_errno; + break; + case F_GETFD: + ret = (fp->f_flags & O_CLOEXEC) ? FD_CLOEXEC : 0; + break; + case F_SETFD: + FD_LOCK(fp); + fp->f_flags = (fp->f_flags & ~O_CLOEXEC) | + ((arg & FD_CLOEXEC) ? O_CLOEXEC : 0); + FD_UNLOCK(fp); + break; + case F_GETFL: + // As explained above, the O_CLOEXEC should have been in f_flags, + // and shouldn't be returned. 
Linux always returns 0100000 ("the + // flag formerly known as O_LARGEFILE) so let's do it too. + ret = (oflags(fp->f_flags) & ~O_CLOEXEC) | 0100000; + break; + case F_SETFL: + FD_LOCK(fp); + fp->f_flags = fflags((oflags(fp->f_flags) & ~SETFL) | + (arg & SETFL)); + FD_UNLOCK(fp); + + /* Sync nonblocking/async state with file flags */ + tmp = fp->f_flags & FNONBLOCK; + fp->ioctl(FIONBIO, &tmp); + tmp = fp->f_flags & FASYNC; + fp->ioctl(FIOASYNC, &tmp); + + break; + case F_SETLK: + WARN_ONCE("fcntl(F_SETLK) stubbed\n"); + break; + case F_GETLK: + WARN_ONCE("fcntl(F_GETLK) stubbed\n"); + break; + case F_SETLKW: + WARN_ONCE("fcntl(F_SETLKW) stubbed\n"); + break; + case F_SETOWN: + WARN_ONCE("fcntl(F_SETOWN) stubbed\n"); + break; + default: + kprintf("unsupported fcntl cmd 0x%x\n", cmd); + error = EINVAL; + } + + fdrop(fp); + if (error) + goto out_errno; + trace_vfs_fcntl_ret(ret); + return ret; + + out_errno: + trace_vfs_fcntl_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_access, "\"%s\" 0%0o", const char*, int); +TRACEPOINT(trace_vfs_access_ret, ""); +TRACEPOINT(trace_vfs_access_err, "%d", int); + +/* + * Check permission for file access + */ +int access(const char *pathname, int mode) +{ + trace_vfs_access(pathname, mode); + struct task *t = main_task; + char path[PATH_MAX]; + int acc, error = 0; + + acc = 0; + if (mode & R_OK) + acc |= VREAD; + if (mode & W_OK) + acc |= VWRITE; + + if ((error = task_conv(t, pathname, acc, path)) != 0) + goto out_errno; + + error = sys_access(path, mode); + if (error) + goto out_errno; + trace_vfs_access_ret(); + return 0; + out_errno: + errno = error; + trace_vfs_access_err(error); + return -1; +} + +int faccessat(int dirfd, const char *pathname, int mode, int flags) +{ + if (flags & AT_SYMLINK_NOFOLLOW) { + UNIMPLEMENTED("faccessat() with AT_SYMLINK_NOFOLLOW"); + } + + if (pathname[0] == '/' || dirfd == AT_FDCWD) { + return access(pathname, mode); + } + + struct file *fp; + int error = fget(dirfd, &fp); + if 
(error) { + errno = error; + return -1; + } + + struct vnode *vp = fp->f_dentry->d_vnode; + vn_lock(vp); + + std::unique_ptr<char []> up (new char[PATH_MAX]); + char *p = up.get(); + + /* build absolute path */ + strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); + strlcat(p, fp->f_dentry->d_path, PATH_MAX); + strlcat(p, "/", PATH_MAX); + strlcat(p, pathname, PATH_MAX); + + error = access(p, mode); + + vn_unlock(vp); + fdrop(fp); + + return error; +} + +extern "C" +int euidaccess(const char *pathname, int mode) +{ + return access(pathname, mode); +} + +weak_alias(euidaccess,eaccess); + +#if 0 +static int +fs_pipe(struct task *t, struct msg *msg) +{ +#ifdef CONFIG_FIFOFS + char path[PATH_MAX]; + file_t rfp, wfp; + int error, rfd, wfd; + + DPRINTF(VFSDB_CORE, ("fs_pipe\n")); + + if ((rfd = task_newfd(t)) == -1) + return EMFILE; + t->t_ofile[rfd] = (file_t)1; /* temp */ + + if ((wfd = task_newfd(t)) == -1) { + t->t_ofile[rfd] = nullptr; + return EMFILE; + } + sprintf(path, "/mnt/fifo/pipe-%x-%d", (u_int)t->t_taskid, rfd); + + if ((error = sys_mknod(path, S_IFIFO)) != 0) + goto out; + if ((error = sys_open(path, O_RDONLY | O_NONBLOCK, 0, &rfp)) != 0) { + goto out; + } + if ((error = sys_open(path, O_WRONLY | O_NONBLOCK, 0, &wfp)) != 0) { + goto out; + } + t->t_ofile[rfd] = rfp; + t->t_ofile[wfd] = wfp; + t->t_nopens += 2; + msg->data[0] = rfd; + msg->data[1] = wfd; + return 0; + out: + t->t_ofile[rfd] = nullptr; + t->t_ofile[wfd] = nullptr; + return error; +#else + return ENOSYS; +#endif +} +#endif + +TRACEPOINT(trace_vfs_isatty, "%d", int); +TRACEPOINT(trace_vfs_isatty_ret, "%d", int); +TRACEPOINT(trace_vfs_isatty_err, "%d", int); + +/* + * Return if specified file is a tty + */ +int isatty(int fd) +{ + struct file *fp; + int istty = 0; + + trace_vfs_isatty(fd); + fileref f(fileref_from_fd(fd)); + if (!f) { + errno = EBADF; + trace_vfs_isatty_err(errno); + return -1; + } + + fp = f.get(); + if (dynamic_cast<tty_file*>(fp) || + (fp->f_dentry && 
fp->f_dentry->d_vnode->v_flags & VISTTY)) { + istty = 1; + } + + trace_vfs_isatty_ret(istty); + return istty; +} + +TRACEPOINT(trace_vfs_truncate, "\"%s\" 0x%x", const char*, off_t); +TRACEPOINT(trace_vfs_truncate_ret, ""); +TRACEPOINT(trace_vfs_truncate_err, "%d", int); + +int truncate(const char *pathname, off_t length) +{ + trace_vfs_truncate(pathname, length); + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + error = ENOENT; + if (pathname == nullptr) + goto out_errno; + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + + error = sys_truncate(path, length); + if (error) + goto out_errno; + trace_vfs_truncate_ret(); + return 0; + out_errno: + errno = error; + trace_vfs_truncate_err(error); + return -1; +} + +LFS64(truncate); + +TRACEPOINT(trace_vfs_ftruncate, "%d 0x%x", int, off_t); +TRACEPOINT(trace_vfs_ftruncate_ret, ""); +TRACEPOINT(trace_vfs_ftruncate_err, "%d", int); + +int ftruncate(int fd, off_t length) +{ + trace_vfs_ftruncate(fd, length); + struct file *fp; + int error; + + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_ftruncate(fp, length); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_ftruncate_ret(); + return 0; + + out_errno: + errno = error; + trace_vfs_ftruncate_err(error); + return -1; +} + +LFS64(ftruncate); + +ssize_t readlink(const char *pathname, char *buf, size_t bufsize) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + ssize_t size; + + error = -EINVAL; + if (bufsize <= 0) + goto out_errno; + + error = ENOENT; + if (pathname == nullptr) + goto out_errno; + error = task_conv(t, pathname, VWRITE, path); + if (error) + goto out_errno; + + size = 0; + error = sys_readlink(path, buf, bufsize, &size); + + if (error != 0) + goto out_errno; + + return size; + out_errno: + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_fallocate, "%d %d 0x%x 0x%x", int, int, loff_t, loff_t); +TRACEPOINT(trace_vfs_fallocate_ret, ""); 
+TRACEPOINT(trace_vfs_fallocate_err, "%d", int); + +int fallocate(int fd, int mode, loff_t offset, loff_t len) +{ + struct file *fp; + int error; + + trace_vfs_fallocate(fd, mode, offset, len); + error = fget(fd, &fp); + if (error) + goto out_errno; + + error = sys_fallocate(fp, mode, offset, len); + fdrop(fp); + + if (error) + goto out_errno; + trace_vfs_fallocate_ret(); + return 0; + + out_errno: + trace_vfs_fallocate_err(error); + errno = error; + return -1; +} + +LFS64(fallocate); + +TRACEPOINT(trace_vfs_utimes, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_utimes_ret, ""); +TRACEPOINT(trace_vfs_utimes_err, "%d", int); + +int futimes(int fd, const struct timeval times[2]) +{ + return futimesat(fd, nullptr, times); +} + +int futimesat(int dirfd, const char *pathname, const struct timeval times[2]) +{ + struct stat st; + struct file *fp; + int error; + char *absolute_path; + + if ((pathname && pathname[0] == '/') || dirfd == AT_FDCWD) + return utimes(pathname, times); + + // Note: if pathname == nullptr, futimesat operates on dirfd itself, and in + // that case it doesn't have to be a directory. 
+ if (pathname) { + error = fstat(dirfd, &st); + if (error) { + error = errno; + goto out_errno; + } + + if (!S_ISDIR(st.st_mode)){ + error = ENOTDIR; + goto out_errno; + } + } + + error = fget(dirfd, &fp); + if (error) + goto out_errno; + + /* build absolute path */ + absolute_path = (char*)malloc(PATH_MAX); + strlcpy(absolute_path, fp->f_dentry->d_mount->m_path, PATH_MAX); + strlcat(absolute_path, fp->f_dentry->d_path, PATH_MAX); + + if (pathname) { + strlcat(absolute_path, "/", PATH_MAX); + strlcat(absolute_path, pathname, PATH_MAX); + } + + error = utimes(absolute_path, times); + free(absolute_path); + + fdrop(fp); + + if (error) + goto out_errno; + return 0; + + out_errno: + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_utimensat, "\"%s\"", const char*); +TRACEPOINT(trace_vfs_utimensat_ret, ""); +TRACEPOINT(trace_vfs_utimensat_err, "%d", int); + +extern "C" +int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags) +{ + trace_vfs_utimensat(pathname); + + auto error = sys_utimensat(dirfd, pathname, times, flags); + if (error) { + trace_vfs_utimensat_err(error); + errno = error; + return -1; + } + + trace_vfs_utimensat_ret(); + return 0; +} + +TRACEPOINT(trace_vfs_futimens, "%d", int); +TRACEPOINT(trace_vfs_futimens_ret, ""); +TRACEPOINT(trace_vfs_futimens_err, "%d", int); + +extern "C" +int futimens(int fd, const struct timespec times[2]) +{ + trace_vfs_futimens(fd); + + auto error = sys_futimens(fd, times); + if (error) { + trace_vfs_futimens_err(error); + errno = error; + return -1; + } + + trace_vfs_futimens_ret(); + return 0; +} + +static int do_utimes(const char *pathname, const struct timeval times[2], int flags) +{ + struct task *t = main_task; + char path[PATH_MAX]; + int error; + + trace_vfs_utimes(pathname); + + error = task_conv(t, pathname, 0, path); + if (error) { + trace_vfs_utimes_err(error); + return libc_error(error); + } + + error = sys_utimes(path, times, flags); + if (error) { + 
trace_vfs_utimes_err(error); + return libc_error(error); + } + + trace_vfs_utimes_ret(); + return 0; +} + +extern "C" +int utimes(const char *pathname, const struct timeval times[2]) +{ + return do_utimes(pathname, times, 0); +} + +extern "C" +int lutimes(const char *pathname, const struct timeval times[2]) +{ + return do_utimes(pathname, times, AT_SYMLINK_NOFOLLOW); +} + +extern "C" +int utime(const char *pathname, const struct utimbuf *t) +{ + using namespace std::chrono; + + struct timeval times[2]; + times[0].tv_usec = 0; + times[1].tv_usec = 0; + if (!t) { + long int tsec = duration_cast<seconds>(osv::clock::wall::now().time_since_epoch()).count(); + times[0].tv_sec = tsec; + times[1].tv_sec = tsec; + } else { + times[0].tv_sec = t->actime; + times[1].tv_sec = t->modtime; + } + + return utimes(pathname, times); +} + +TRACEPOINT(trace_vfs_chmod, "\"%s\" 0%0o", const char*, mode_t); +TRACEPOINT(trace_vfs_chmod_ret, ""); +TRACEPOINT(trace_vfs_chmod_err, "%d", int); + +int chmod(const char *pathname, mode_t mode) +{ + trace_vfs_chmod(pathname, mode); + struct task *t = main_task; + char path[PATH_MAX]; + int error = ENOENT; + if (pathname == nullptr) + goto out_errno; + if ((error = task_conv(t, pathname, VWRITE, path)) != 0) + goto out_errno; + error = sys_chmod(path, mode & ALLPERMS); + if (error) + goto out_errno; + trace_vfs_chmod_ret(); + return 0; +out_errno: + trace_vfs_chmod_err(error); + errno = error; + return -1; +} + +TRACEPOINT(trace_vfs_fchmod, "\"%d\" 0%0o", int, mode_t); +TRACEPOINT(trace_vfs_fchmod_ret, ""); + +int fchmod(int fd, mode_t mode) +{ + trace_vfs_fchmod(fd, mode); + auto error = sys_fchmod(fd, mode & ALLPERMS); + trace_vfs_fchmod_ret(); + if (error) { + errno = error; + return -1; + } else { + return 0; + } +} + +TRACEPOINT(trace_vfs_fchown, "\"%d\" %d %d", int, uid_t, gid_t); +TRACEPOINT(trace_vfs_fchown_ret, ""); + +int fchown(int fd, uid_t owner, gid_t group) +{ + trace_vfs_fchown(fd, owner, group); + WARN_STUBBED(); + 
trace_vfs_fchown_ret(); + return 0; +} + +int chown(const char *path, uid_t owner, gid_t group) +{ + WARN_STUBBED(); + return 0; +} + +int lchown(const char *path, uid_t owner, gid_t group) +{ + WARN_STUBBED(); + return 0; +} + + +ssize_t sendfile(int out_fd, int in_fd, off_t *_offset, size_t count) +{ + struct file *in_fp; + struct file *out_fp; + fileref in_f{fileref_from_fd(in_fd)}; + fileref out_f{fileref_from_fd(out_fd)}; + + if (!in_f || !out_f) { + return libc_error(EBADF); + } + + in_fp = in_f.get(); + out_fp = out_f.get(); + + if (!in_fp->f_dentry) { + return libc_error(EBADF); + } + + if (!(in_fp->f_flags & FREAD)) { + return libc_error(EBADF); + } + + if (out_fp->f_type & DTYPE_VNODE) { + if (!out_fp->f_dentry) { + return libc_error(EBADF); + } else if (!(out_fp->f_flags & FWRITE)) { + return libc_error(EBADF); + } + } + + off_t offset ; + + if (_offset != nullptr) { + offset = *_offset; + } else { + /* if _offset is nullptr, we need to read from the present position of in_fd */ + offset = lseek(in_fd, 0, SEEK_CUR); + } + + // Constrain count to the extent of the file... 
+ struct stat st; + if (fstat(in_fd, &st) < 0) { + return -1; + } else { + if (offset >= st.st_size) { + return 0; + } else if ((offset + count) >= st.st_size) { + count = st.st_size - offset; + if (count == 0) { + return 0; + } + } + } + + size_t bytes_to_mmap = count + (offset % mmu::page_size); + off_t offset_for_mmap = align_down(offset, (off_t)mmu::page_size); + + char *src = static_cast<char *>(mmap(nullptr, bytes_to_mmap, PROT_READ, MAP_SHARED, in_fd, offset_for_mmap)); + + if (src == MAP_FAILED) { + return -1; + } + + auto ret = write(out_fd, src + (offset % PAGESIZE), count); + + if (ret < 0) { + return libc_error(errno); + } else if(_offset == nullptr) { + lseek(in_fd, ret, SEEK_CUR); + } else { + *_offset += ret; + } + + assert(munmap(src, count) == 0); + + return ret; +} + +#undef sendfile64 +LFS64(sendfile); + +NO_SYS(int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags)); + +mode_t umask(mode_t newmask) +{ + return global_umask.exchange(newmask, std::memory_order_relaxed); +} + +int +fs_noop(void) +{ + return 0; +} + +int chroot(const char *path) +{ + WARN_STUBBED(); + errno = ENOSYS; + return -1; +} + +// unpack_bootfs() unpacks a collection of files stored as part of the OSv +// executable (in memory location "bootfs_start") into the file system, +// normally the in-memory filesystem ramfs. +// The files are packed in the executable in an ad-hoc format defined here. +// Code in scripts/mkbootfs.py packs files into this format. +#define BOOTFS_PATH_MAX 111 +enum class bootfs_file_type : char { other = 0, symlink = 1 }; +struct bootfs_metadata { + uint64_t size; + uint64_t offset; + // The file's type. Can be "symlink" or "other". A directory is an "other" + // file with its name ending with a "/" (and no content). + bootfs_file_type type; + // name must end with a null. For symlink files, the content must end + // with a null as well. 
+ char name[BOOTFS_PATH_MAX]; +}; + +extern char bootfs_start; + +int ramfs_set_file_data(struct vnode *vp, const void *data, size_t size); +void unpack_bootfs(void) +{ + struct bootfs_metadata *md = (struct bootfs_metadata *)&bootfs_start; + int fd, i; + + for (i = 0; md[i].name[0]; i++) { + int ret; + char *p; + + // mkdir() directories needed for this path name, as necessary + char tmp[BOOTFS_PATH_MAX]; + strlcpy(tmp, md[i].name, BOOTFS_PATH_MAX); + for (p = tmp; *p; ++p) { + if (*p == '/') { + *p = '\0'; + mkdir(tmp, 0666); // silently ignore errors and existing dirs + *p = '/'; + } + } + + if (md[i].type == bootfs_file_type::symlink) { + // This is a symbolic link record. The file's content is the + // target path, and we assume ends with a null. + if (symlink(&bootfs_start + md[i].offset, md[i].name) != 0) { + kprintf("couldn't symlink %s: %d\n", md[i].name, errno); + sys_panic("unpack_bootfs failed"); + } + continue; + } + if (*(p-1) == '/' && md[i].size == 0) { + // This is directory record. 
Nothing else to do + continue; + } + + fd = creat(md[i].name, 0666); + if (fd < 0) { + kprintf("couldn't create %s: %d\n", + md[i].name, errno); + sys_panic("unpack_bootfs failed"); + } + + struct file *fp; + int error = fget(fd, &fp); + if (error) { + kprintf("couldn't fget %s: %d\n", + md[i].name, error); + sys_panic("unpack_bootfs failed"); + } + + struct vnode *vp = fp->f_dentry->d_vnode; + ret = ramfs_set_file_data(vp, &bootfs_start + md[i].offset, md[i].size); + if (ret) { + kprintf("ramfs_set_file_data failed, ret = %d\n", ret); + sys_panic("unpack_bootfs failed"); + } + + fdrop(fp); + close(fd); + } +} + +void mount_rootfs(void) +{ + int ret; + + ret = sys_mount("", "/", "ramfs", 0, nullptr); + if (ret) + kprintf("failed to mount rootfs, error = %s\n", strerror(ret)); + + if (mkdir("/dev", 0755) < 0) + kprintf("failed to create /dev, error = %s\n", strerror(errno)); + + ret = sys_mount("", "/dev", "devfs", 0, nullptr); + if (ret) + kprintf("failed to mount devfs, error = %s\n", strerror(ret)); +} + +extern "C" +int nmount(struct iovec *iov, unsigned niov, int flags) +{ + struct args { + char* fstype = nullptr; + char* fspath = nullptr; + char* from = nullptr; + }; + static unordered_map<string, char* args::*> argmap { + { "fstype", &args::fstype }, + { "fspath", &args::fspath }, + { "from", &args::from }, + }; + args a; + for (size_t i = 0; i < niov; i += 2) { + std::string s(static_cast<const char*>(iov[i].iov_base)); + if (argmap.count(s)) { + a.*(argmap[s]) = static_cast<char*>(iov[i+1].iov_base); + } + } + return sys_mount(a.from, a.fspath, a.fstype, flags, nullptr); +} + +static void import_extra_zfs_pools(void) +{ + struct stat st; + int ret; + + // The file '/etc/mnttab' is a LibZFS requirement and will not + // exist during cpiod phase. The functionality provided by this + // function isn't needed during that phase, so let's skip it. + if (stat("/etc/mnttab" , &st) != 0) { + return; + } + + // Import extra pools mounting datasets there contained. 
+ // Datasets from osv pool will not be mounted here. + if (access("zpool.so", X_OK) != 0) { + return; + } + vector<string> zpool_args = {"zpool", "import", "-f", "-a" }; + auto ok = osv::run("zpool.so", zpool_args, &ret); + assert(ok); + + if (!ret) { + debug("zfs: extra ZFS pool(s) found.\n"); + } +} + +void pivot_rootfs(const char* path) +{ + int ret = sys_pivot_root(path, "/"); + if (ret) + kprintf("failed to pivot root, error = %s\n", strerror(ret)); + + auto ent = setmntent("/etc/fstab", "r"); + if (!ent) { + return; + } + + struct mntent *m = nullptr; + while ((m = getmntent(ent)) != nullptr) { + if (!strcmp(m->mnt_dir, "/")) { + continue; + } + + if ((m->mnt_opts != nullptr) && strcmp(m->mnt_opts, MNTOPT_DEFAULTS)) { + printf("Warning: opts %s, ignored for fs %s\n", m->mnt_opts, m->mnt_type); + } + + // FIXME: Right now, ignoring mntops. In the future we may have an option parser + ret = sys_mount(m->mnt_fsname, m->mnt_dir, m->mnt_type, 0, nullptr); + if (ret) { + printf("failed to mount %s, error = %s\n", m->mnt_type, strerror(ret)); + } + } + endmntent(ent); +} + +extern "C" void unmount_devfs() +{ + int ret = sys_umount("/dev"); + if (ret) + kprintf("failed to unmount /dev, error = %s\n", strerror(ret)); +} + +extern "C" int mount_rofs_rootfs(bool pivot_root) +{ + int ret; + + if (mkdir("/rofs", 0755) < 0) + kprintf("failed to create /rofs, error = %s\n", strerror(errno)); + + ret = sys_mount("/dev/vblk0.1", "/rofs", "rofs", MNT_RDONLY, 0); + + if (ret) { + kprintf("failed to mount /rofs, error = %s\n", strerror(ret)); + rmdir("/rofs"); + return ret; + } + + if (pivot_root) { + pivot_rootfs("/rofs"); + } + + return 0; +} + +extern "C" void mount_zfs_rootfs(bool pivot_root) +{ + if (mkdir("/zfs", 0755) < 0) + kprintf("failed to create /zfs, error = %s\n", strerror(errno)); + + int ret = sys_mount("/dev/vblk0.1", "/zfs", "zfs", 0, (void *)"osv/zfs"); + + if (ret) + kprintf("failed to mount /zfs, error = %s\n", strerror(ret)); + + if (!pivot_root) { + 
return; + } + + pivot_rootfs("/zfs"); + + import_extra_zfs_pools(); +} + +extern "C" void unmount_rootfs(void) +{ + int ret; + + sys_umount("/dev"); + + ret = sys_umount("/proc"); + if (ret) { + kprintf("Warning: unmount_rootfs: failed to unmount /proc, " + "error = %s\n", strerror(ret)); + } + + ret = sys_umount2("/", MNT_FORCE); + if (ret) { + kprintf("Warning: unmount_rootfs: failed to unmount /, " + "error = %s\n", strerror(ret)); + } +} + +extern "C" void bio_init(void); +extern "C" void bio_sync(void); + +int vfs_initialized; + +extern "C" +void +vfs_init(void) +{ + const struct vfssw *fs; + + bio_init(); + lookup_init(); + vnode_init(); + task_alloc(&main_task); + + /* + * Initialize each file system. + */ + for (fs = vfssw; fs->vs_name; fs++) { + if (fs->vs_init) { + DPRINTF(VFSDB_CORE, ("VFS: initializing %s\n", + fs->vs_name)); + fs->vs_init(); + } + } + + mount_rootfs(); + unpack_bootfs(); + + // if (open("/dev/console", O_RDWR, 0) != 0) + if (console::open() != 0) + kprintf("failed to open console, error = %d\n", errno); + if (dup(0) != 1) + kprintf("failed to dup console (1)\n"); + if (dup(0) != 2) + kprintf("failed to dup console (2)\n"); + vfs_initialized = 1; +} + +void vfs_exit(void) +{ + /* Free up main_task (stores cwd data) resources */ + replace_cwd(main_task, nullptr, []() { return 0; }); + /* Unmount all file systems */ + unmount_rootfs(); + /* Finish with the bio layer */ + bio_sync(); +} + +void sys_panic(const char *str) +{ + abort("panic: %s", str); +} + diff --git a/lib/vfscore/mount.c b/lib/vfscore/mount.c new file mode 100644 index 00000000..dac4d09c --- /dev/null +++ b/lib/vfscore/mount.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * vfs_mount.c - mount operations + */ + +#include <sys/stat.h> +#include <sys/param.h> +#include <dirent.h> + +#include <limits.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> + +#include <osv/prex.h> +#include <osv/vnode.h> +#include <osv/device.h> +#include <osv/debug.h> +#include <osv/mutex.h> +#include "vfs.h" + +#include <memory> +#include <list> + +/* + * List for VFS mount points. + */ +static std::list<mount*> mount_list; + +/* + * Global lock to access mount point. + */ +static mutex mount_lock; + +/* + * Lookup file system. 
+ */ +static const struct vfssw * +fs_getfs(const char *name) +{ + const struct vfssw *fs; + + for (fs = vfssw; fs->vs_name; fs++) { + if (!strncmp(name, fs->vs_name, FSMAXNAMES)) + break; + } + if (!fs->vs_name) + return nullptr; + return fs; +} + +const char* +fs_getfsname(vfsops* ops) +{ + for (auto fs = vfssw; fs->vs_name; fs++) { + if (fs->vs_op == ops) { + return fs->vs_name; + } + } + abort(); +} + +int +sys_mount(const char *dev, const char *dir, const char *fsname, int flags, const void *data) +{ + const struct vfssw *fs; + struct mount *mp; + struct device *device; + struct dentry *dp_covered; + struct vnode *vp; + int error; + + kprintf("VFS: mounting %s at %s\n", fsname, dir); + + if (!dir || *dir == '\0') + return ENOENT; + + /* Find a file system. */ + if (!(fs = fs_getfs(fsname))) + return ENODEV; /* No such file system */ + + /* Open device. nullptr can be specified as a device. */ + // Allow device_open() to fail, in which case dev is interpreted + // by the file system mount routine (e.g zfs pools) + device = 0; + if (dev && strncmp(dev, "/dev/", 5) == 0) + device_open(dev + 5, DO_RDWR, &device); + + /* Check if device or directory has already been mounted. */ + // We need to avoid the situation where after we already verified that + // the mount point is free, but before we actually add it to mount_list, + // another concurrent mount adds it. So we use a new mutex to ensure + // that only one sys_mount() runs at a time. We cannot reuse the existing + // mount_lock for this purpose: If we take mount_lock and then do + // lookups, this is lock order inversion and can result in deadlock. + static mutex sys_mount_lock; + SCOPE_LOCK(sys_mount_lock); + WITH_LOCK(mount_lock) { + for (auto&& mp : mount_list) { + if (!strcmp(mp->m_path, dir) || + (device && mp->m_dev == device)) { + error = EBUSY; /* Already mounted */ + goto err1; + } + } + } + /* + * Create VFS mount entry. 
+ */ + if (!(mp = new mount)) { + error = ENOMEM; + goto err1; + } + mp->m_count = 0; + mp->m_op = fs->vs_op; + mp->m_flags = flags; + mp->m_dev = device; + mp->m_data = nullptr; + strlcpy(mp->m_path, dir, sizeof(mp->m_path)); + strlcpy(mp->m_special, dev, sizeof(mp->m_special)); + + /* + * Get vnode to be covered in the upper file system. + */ + if (*dir == '/' && *(dir + 1) == '\0') { + /* Ignore if it mounts to global root directory. */ + dp_covered = nullptr; + } else { + if ((error = namei(dir, &dp_covered)) != 0) { + + error = ENOENT; + goto err2; + } + if (dp_covered->d_vnode->v_type != VDIR) { + error = ENOTDIR; + goto err3; + } + } + mp->m_covered = dp_covered; + + /* + * Create a root vnode for this file system. + */ + vget(mp, 0, &vp); + if (vp == nullptr) { + error = ENOMEM; + goto err3; + } + vp->v_type = VDIR; + vp->v_flags = VROOT; + vp->v_mode = S_IFDIR | S_IRUSR | S_IWUSR | S_IXUSR; + + mp->m_root = dentry_alloc(nullptr, vp, "/"); + if (!mp->m_root) { + vput(vp); + goto err3; + } + vput(vp); + + /* + * Call a file system specific routine. 
+ */ + if ((error = VFS_MOUNT(mp, dev, flags, data)) != 0) + goto err4; + + if (mp->m_flags & MNT_RDONLY) + vp->v_mode &=~S_IWUSR; + + /* + * Insert to mount list + */ + WITH_LOCK(mount_lock) { + mount_list.push_back(mp); + } + + return 0; /* success */ + err4: + drele(mp->m_root); + err3: + if (dp_covered) + drele(dp_covered); + err2: + delete mp; + err1: + if (device) + device_close(device); + + return error; +} + +void +release_mp_dentries(struct mount *mp) +{ + /* Decrement referece count of root vnode */ + if (mp->m_covered) { + drele(mp->m_covered); + } + + /* Release root dentry */ + drele(mp->m_root); +} + +int +sys_umount2(const char *path, int flags) +{ + struct mount *mp; + int error, pathlen; + + kprintf("VFS: unmounting %s\n", path); + + SCOPE_LOCK(mount_lock); + + pathlen = strlen(path); + if (pathlen >= MAXPATHLEN) { + error = ENAMETOOLONG; + goto out; + } + + /* Get mount entry */ + for (auto&& tmp : mount_list) { + if (!strcmp(path, tmp->m_path)) { + mp = tmp; + goto found; + } + } + + error = EINVAL; + goto out; + +found: + /* + * Root fs can not be unmounted. 
+ */ + if (mp->m_covered == nullptr && !(flags & MNT_FORCE)) { + error = EINVAL; + goto out; + } + + if ((error = VFS_UNMOUNT(mp, flags)) != 0) + goto out; + mount_list.remove(mp); + +#ifdef HAVE_BUFFERS + /* Flush all buffers */ + binval(mp->m_dev); +#endif + + if (mp->m_dev) + device_close(mp->m_dev); + delete mp; + out: + return error; +} + +int +sys_umount(const char *path) +{ + return sys_umount2(path, 0); +} + +int +sys_pivot_root(const char *new_root, const char *put_old) +{ + struct mount *newmp = nullptr, *oldmp = nullptr; + int error; + + WITH_LOCK(mount_lock) { + for (auto&& mp : mount_list) { + if (!strcmp(mp->m_path, new_root)) { + newmp = mp; + } + if (!strcmp(mp->m_path, put_old)) { + oldmp = mp; + } + } + if (!newmp || !oldmp || newmp == oldmp) { + return EINVAL; + } + for (auto&& mp : mount_list) { + if (mp == newmp || mp == oldmp) { + continue; + } + if (!strncmp(mp->m_path, put_old, strlen(put_old))) { + return EBUSY; + } + } + if ((error = VFS_UNMOUNT(oldmp, 0)) != 0) { + return error; + } + mount_list.remove(oldmp); + + newmp->m_root->d_vnode->v_mount = newmp; + + if (newmp->m_covered) { + drele(newmp->m_covered); + } + newmp->m_covered = nullptr; + + if (newmp->m_root->d_parent) { + drele(newmp->m_root->d_parent); + } + newmp->m_root->d_parent = nullptr; + + strlcpy(newmp->m_path, "/", sizeof(newmp->m_path)); + } + return 0; +} + +int +sys_sync(void) +{ + /* Call each mounted file system. */ + WITH_LOCK(mount_lock) { + for (auto&& mp : mount_list) { + VFS_SYNC(mp); + } + } +#ifdef HAVE_BUFFERS + bio_sync(); +#endif + return 0; +} + +/* + * Compare two path strings. Return matched length. + * @path: target path. + * @root: vfs root path as mount point. 
+ */ +static size_t +count_match(const char *path, char *mount_root) +{ + size_t len = 0; + + while (*path && *mount_root) { + if (*path != *mount_root) + break; + + path++; + mount_root++; + len++; + } + if (*mount_root != '\0') + return 0; + + if (len == 1 && *(path - 1) == '/') + return 1; + + if (*path == '\0' || *path == '/') + return len; + return 0; +} + +/* + * Get the root directory and mount point for specified path. + * @path: full path. + * @mp: mount point to return. + * @root: pointer to root directory in path. + */ +int +vfs_findroot(const char *path, struct mount **mp, char **root) +{ + struct mount *m = nullptr; + size_t len, max_len = 0; + + if (!path) + return -1; + + /* Find mount point from nearest path */ + SCOPE_LOCK(mount_lock); + for (auto&& tmp : mount_list) { + len = count_match(path, tmp->m_path); + if (len > max_len) { + max_len = len; + m = tmp; + } + } + if (m == nullptr) + return -1; + *root = (char *)(path + max_len); + if (**root == '/') + (*root)++; + *mp = m; + return 0; +} + +/* + * Mark a mount point as busy. + */ +void +vfs_busy(struct mount *mp) +{ + SCOPE_LOCK(mount_lock); + mp->m_count++; +} + + +/* + * Mark a mount point as busy. 
+ */ +void +vfs_unbusy(struct mount *mp) +{ + SCOPE_LOCK(mount_lock); + mp->m_count--; +} + +int +vfs_nullop(void) +{ + return 0; +} + +int +vfs_einval(void) +{ + return EINVAL; +} + +namespace osv { + +mount_desc to_mount_desc(mount* m) +{ + mount_desc ret; + ret.special = m->m_special; + ret.path = m->m_path; + ret.type = fs_getfsname(m->m_op); + // FIXME: record options + ret.options = ""; + return ret; +} + +std::vector<mount_desc> +current_mounts() +{ + WITH_LOCK(mount_lock) { + std::vector<mount_desc> ret; + for (auto&& mp : mount_list) { + ret.push_back(to_mount_desc(mp)); + } + return ret; + } +} + +} + +#ifdef DEBUG_VFS +void +mount_dump(void) +{ + SCOPE_LOCK(mount_lock); + + kprintf("mount_dump\n"); + kprintf("dev count root\n"); + kprintf("-------- ----- --------\n"); + + for (auto&& mp : mount_list) { + kprintf("%8x %5d %s\n", mp->m_dev, mp->m_count, mp->m_path); + } +} +#endif diff --git a/lib/vfscore/subr_uio.c b/lib/vfscore/subr_uio.c new file mode 100644 index 00000000..bf138b8e --- /dev/null +++ b/lib/vfscore/subr_uio.c @@ -0,0 +1,73 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + */ + +#include <assert.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <osv/uio.h> + +int +uiomove(void *cp, int n, struct uio *uio) +{ + assert(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE); + + while (n > 0 && uio->uio_resid) { + struct iovec *iov = uio->uio_iov; + int cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + if (uio->uio_rw == UIO_READ) + memcpy(iov->iov_base, cp, cnt); + else + memcpy(cp, iov->iov_base, cnt); + + iov->iov_base = (char *)iov->iov_base + cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp = (char *)cp + cnt; + n -= cnt; + } + + return 0; +} diff --git a/lib/vfscore/syscalls.c b/lib/vfscore/syscalls.c new file mode 100644 index 00000000..487d5729 --- /dev/null +++ b/lib/vfscore/syscalls.c @@ -0,0 +1,1486 @@ +/* + * Copyright (C) 2013 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * vfs_syscalls.c - everything in this file is a routine implementing + * a VFS system call. + */ + +#include <sys/stat.h> +#include <dirent.h> + +#include <limits.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> + +#include <osv/prex.h> +#include <osv/vnode.h> +#include <osv/vfs_file.hh> +#include "vfs.h" +#include <fs/fs.hh> + +extern struct task *main_task; + +static int +open_no_follow_chk(char *path) +{ + int error; + struct dentry *ddp; + char *name; + struct dentry *dp; + struct vnode *vp; + + ddp = nullptr; + dp = nullptr; + vp = nullptr; + + error = lookup(path, &ddp, &name); + if (error) { + return (error); + } + + error = namei_last_nofollow(path, ddp, &dp); + if (error) { + goto out; + } + + vp = dp->d_vnode; + vn_lock(vp); + if (vp->v_type == VLNK) { + error = ELOOP; + goto out; + } + + error = 0; +out: + if (vp != nullptr) { + vn_unlock(vp); + } + + if (dp != nullptr) { + drele(dp); + } + + if (ddp != nullptr) { + drele(ddp); + } + + return (error); +} + +int +sys_open(char *path, int flags, mode_t mode, struct file **fpp) +{ + file *fp; + struct dentry *dp, *ddp; + struct vnode *vp; + char 
*filename; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_open: path=%s flags=%x mode=%x\n", + path, flags, mode)); + + flags = fflags(flags); + if (flags & O_CREAT) { + error = namei(path, &dp); + if (error == ENOENT) { + /* Create new file. */ + if ((error = lookup(path, &ddp, &filename)) != 0) + return error; + + vn_lock(ddp->d_vnode); + if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) { + vn_unlock(ddp->d_vnode); + drele(ddp); + return error; + } + mode &= ~S_IFMT; + mode |= S_IFREG; + error = VOP_CREATE(ddp->d_vnode, filename, mode); + vn_unlock(ddp->d_vnode); + drele(ddp); + + if (error) + return error; + if ((error = namei(path, &dp)) != 0) + return error; + + vp = dp->d_vnode; + flags &= ~O_TRUNC; + } else if (error) { + return error; + } else { + /* File already exits */ + if (flags & O_EXCL) { + error = EEXIST; + goto out_drele; + } + } + + vp = dp->d_vnode; + flags &= ~O_CREAT; + } else { + /* Open */ + if (flags & O_NOFOLLOW) { + error = open_no_follow_chk(path); + if (error != 0) { + return (error); + } + } + error = namei(path, &dp); + if (error) + return error; + + vp = dp->d_vnode; + + if (flags & FWRITE || flags & O_TRUNC) { + error = vn_access(vp, VWRITE); + if (error) + goto out_drele; + + error = EISDIR; + if (vp->v_type == VDIR) + goto out_drele; + } + if (flags & O_DIRECTORY) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out_drele; + } + } + } + + vn_lock(vp); + /* Process truncate request */ + if (flags & O_TRUNC) { + error = EINVAL; + if (!(flags & FWRITE) || vp->v_type == VDIR) + goto out_vn_unlock; + + error = VOP_TRUNCATE(vp, 0); + if (error) + goto out_vn_unlock; + } + + try { + fileref f = make_file<vfs_file>(flags); + fp = f.get(); + fhold(fp); + } catch (int err) { + error = err; + goto out_vn_unlock; + } + // change to std::move once dp is a dentry_ref + fp->f_dentry = dentry_ref(dp, false); + dp = nullptr; + + error = VOP_OPEN(vp, fp); + if (error) { + vn_unlock(vp); + // Note direct delete of fp instead of fdrop(fp). 
fp was never + // returned so cannot be in use, and because it wasn't opened + // it cannot be close()ed. + delete fp; + return error; + } + vn_unlock(vp); + + *fpp = fp; + return 0; + +out_vn_unlock: + vn_unlock(vp); +out_drele: + if (dp) { + drele(dp); + } + return error; +} + +int +sys_close(struct file *fp) +{ + + return 0; +} + +int +sys_read(struct file *fp, const struct iovec *iov, size_t niov, + off_t offset, size_t *count) +{ + if ((fp->f_flags & FREAD) == 0) + return EBADF; + + size_t bytes = 0; + auto iovp = iov; + for (unsigned i = 0; i < niov; i++) { + if (iovp->iov_len > IOSIZE_MAX - bytes) { + return EINVAL; + } + bytes += iovp->iov_len; + iovp++; + } + + if (bytes == 0) { + *count = 0; + return 0; + } + + struct uio uio; + // Unfortunately, the current implementation of fp->read zeros the + // iov_len fields when it reads from disk, so we have to copy iov. + std::vector<iovec> copy_iov(iov, iov + niov); + uio.uio_iov = copy_iov.data(); + uio.uio_iovcnt = niov; + uio.uio_offset = offset; + uio.uio_resid = bytes; + uio.uio_rw = UIO_READ; + auto error = fp->read(&uio, (offset == -1) ? 0 : FOF_OFFSET); + *count = bytes - uio.uio_resid; + return error; +} + +int +sys_write(struct file *fp, const struct iovec *iov, size_t niov, + off_t offset, size_t *count) +{ + if ((fp->f_flags & FWRITE) == 0) + return EBADF; + + size_t bytes = 0; + auto iovp = iov; + for (unsigned i = 0; i < niov; i++) { + if (iovp->iov_len > IOSIZE_MAX - bytes) { + return EINVAL; + } + bytes += iovp->iov_len; + iovp++; + } + + if (bytes == 0) { + *count = 0; + return 0; + } + + struct uio uio; + // Unfortunately, the current implementation of fp->write zeros the + // iov_len fields when it writes to disk, so we have to copy iov. + std::vector<iovec> copy_iov(iov, iov + niov); + uio.uio_iov = copy_iov.data(); + uio.uio_iovcnt = niov; + uio.uio_offset = offset; + uio.uio_resid = bytes; + uio.uio_rw = UIO_WRITE; + auto error = fp->write(&uio, (offset == -1) ? 
0 : FOF_OFFSET); + *count = bytes - uio.uio_resid; + return error; +} + +int +sys_lseek(struct file *fp, off_t off, int type, off_t *origin) +{ + struct vnode *vp; + + DPRINTF(VFSDB_SYSCALL, ("sys_seek: fp=%x off=%d type=%d\n", + (u_long)fp, (u_int)off, type)); + + if (!fp->f_dentry) { + // Linux doesn't implement lseek() on pipes, sockets, or ttys. + // In OSV, we only implement lseek() on regular files, backed by vnode + return ESPIPE; + } + + vp = fp->f_dentry->d_vnode; + int error = EINVAL; + vn_lock(vp); + switch (type) { + case SEEK_CUR: + off = fp->f_offset + off; + break; + case SEEK_END: + off = vp->v_size + off; + break; + } + if (off >= 0) { + error = VOP_SEEK(vp, fp, fp->f_offset, off); + if (!error) { + *origin = off; + fp->f_offset = off; + } + } + vn_unlock(vp); + return error; +} + +int +sys_ioctl(struct file *fp, u_long request, void *buf) +{ + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: fp=%x request=%x\n", fp, request)); + + if ((fp->f_flags & (FREAD | FWRITE)) == 0) + return EBADF; + + error = fp->ioctl(request, buf); + + DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: comp error=%d\n", error)); + return error; +} + +int +sys_fsync(struct file *fp) +{ + struct vnode *vp; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_fsync: fp=%x\n", fp)); + + if (!fp->f_dentry) + return EINVAL; + + vp = fp->f_dentry->d_vnode; + vn_lock(vp); + error = VOP_FSYNC(vp, fp); + vn_unlock(vp); + return error; +} + +int +sys_fstat(struct file *fp, struct stat *st) +{ + int error = 0; + + DPRINTF(VFSDB_SYSCALL, ("sys_fstat: fp=%x\n", fp)); + + error = fp->stat(st); + + return error; +} + +/* + * Return 0 if directory is empty + */ +static int +check_dir_empty(char *path) +{ + int error; + struct file *fp; + struct dirent dir; + + DPRINTF(VFSDB_SYSCALL, ("check_dir_empty\n")); + + error = sys_open(path, O_RDONLY, 0, &fp); + if (error) + goto out_error; + + do { + error = sys_readdir(fp, &dir); + if (error != 0 && error != EACCES) + break; + } while (!strcmp(dir.d_name, ".") || 
!strcmp(dir.d_name, "..")); + + if (error == ENOENT) + error = 0; + else if (error == 0) { + // Posix specifies to return EEXIST in this case (rmdir of non-empty + // directory, but Linux actually returns ENOTEMPTY). + error = ENOTEMPTY; + } + fdrop(fp); +out_error: + return error; +} + +int +sys_readdir(struct file *fp, struct dirent *dir) +{ + struct vnode *dvp; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_readdir: fp=%x\n", fp)); + + if (!fp->f_dentry) + return ENOTDIR; + + dvp = fp->f_dentry->d_vnode; + vn_lock(dvp); + if (dvp->v_type != VDIR) { + vn_unlock(dvp); + return ENOTDIR; + } + error = VOP_READDIR(dvp, fp, dir); + DPRINTF(VFSDB_SYSCALL, ("sys_readdir: error=%d path=%s\n", + error, dir->d_name)); + vn_unlock(dvp); + return error; +} + +int +sys_rewinddir(struct file *fp) +{ + struct vnode *dvp; + + if (!fp->f_dentry) + return ENOTDIR; + + dvp = fp->f_dentry->d_vnode; + vn_lock(dvp); + if (dvp->v_type != VDIR) { + vn_unlock(dvp); + return EBADF; + } + fp->f_offset = 0; + vn_unlock(dvp); + return 0; +} + +int +sys_seekdir(struct file *fp, long loc) +{ + struct vnode *dvp; + + if (!fp->f_dentry) + return ENOTDIR; + + dvp = fp->f_dentry->d_vnode; + vn_lock(dvp); + if (dvp->v_type != VDIR) { + vn_unlock(dvp); + return EBADF; + } + fp->f_offset = (off_t)loc; + vn_unlock(dvp); + return 0; +} + +int +sys_telldir(struct file *fp, long *loc) +{ + struct vnode *dvp; + + if (!fp->f_dentry) + return ENOTDIR; + + dvp = fp->f_dentry->d_vnode; + vn_lock(dvp); + if (dvp->v_type != VDIR) { + vn_unlock(dvp); + return EBADF; + } + *loc = (long)fp->f_offset; + vn_unlock(dvp); + return 0; +} + +int +sys_mkdir(char *path, mode_t mode) +{ + char *name; + struct dentry *dp, *ddp; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_mkdir: path=%s mode=%d\n", path, mode)); + + error = namei(path, &dp); + if (!error) { + /* File already exists */ + drele(dp); + return EEXIST; + } + + if ((error = lookup(path, &ddp, &name)) != 0) { + /* Directory already exists */ + return error; + } + 
+ vn_lock(ddp->d_vnode); + if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) + goto out; + mode &= ~S_IFMT; + mode |= S_IFDIR; + + error = VOP_MKDIR(ddp->d_vnode, name, mode); + out: + vn_unlock(ddp->d_vnode); + drele(ddp); + return error; +} + +int +sys_rmdir(char *path) +{ + struct dentry *dp, *ddp; + struct vnode *vp; + int error; + char *name; + + DPRINTF(VFSDB_SYSCALL, ("sys_rmdir: path=%s\n", path)); + + if ((error = check_dir_empty(path)) != 0) + return error; + error = namei(path, &dp); + if (error) + return error; + + vp = dp->d_vnode; + vn_lock(vp); + if ((error = vn_access(vp, VWRITE)) != 0) + goto out; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + if (vp->v_flags & VROOT || vp->v_refcnt >= 2) { + error = EBUSY; + goto out; + } + if ((error = lookup(path, &ddp, &name)) != 0) + goto out; + + vn_lock(ddp->d_vnode); + error = VOP_RMDIR(ddp->d_vnode, vp, name); + vn_unlock(ddp->d_vnode); + + vn_unlock(vp); + dentry_remove(dp); + drele(ddp); + drele(dp); + return error; + + out: + vn_unlock(vp); + drele(dp); + return error; +} + +int +sys_mknod(char *path, mode_t mode) +{ + char *name; + struct dentry *dp, *ddp; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_mknod: path=%s mode=%d\n", path, mode)); + + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFIFO: + case S_IFSOCK: + /* OK */ + break; + default: + return EINVAL; + } + + error = namei(path, &dp); + if (!error) { + drele(dp); + return EEXIST; + } + + if ((error = lookup(path, &ddp, &name)) != 0) + return error; + + vn_lock(ddp->d_vnode); + if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) + goto out; + if (S_ISDIR(mode)) + error = VOP_MKDIR(ddp->d_vnode, name, mode); + else + error = VOP_CREATE(ddp->d_vnode, name, mode); + out: + vn_unlock(ddp->d_vnode); + drele(ddp); + return error; +} + +/* + * Returns true when @parent path could represent parent directory + * of a file or directory represented by @child path. 
+ * + * Assumes both paths do not have trailing slashes. + */ +static bool +is_parent(const char *parent, const char *child) +{ + size_t p_len = strlen(parent); + return !strncmp(parent, child, p_len) && (parent[p_len-1] == '/' || child[p_len] == '/'); +} + +static bool +has_trailing(const char *path, char ch) +{ + size_t len = strlen(path); + return len && path[len - 1] == ch; +} + +static void +strip_trailing(char *path, char ch) +{ + size_t len = strlen(path); + + while (len && path[len - 1] == ch) + len--; + + path[len] = '\0'; +} + +int +sys_rename(char *src, char *dest) +{ + struct dentry *dp1, *dp2 = 0, *ddp1, *ddp2; + struct vnode *vp1, *vp2 = 0, *dvp1, *dvp2; + char *sname, *dname; + int error; + char root[] = "/"; + bool ts; /* trailing slash */ + + DPRINTF(VFSDB_SYSCALL, ("sys_rename: src=%s dest=%s\n", src, dest)); + + ts = false; + if (has_trailing(src, '/') == true) { + if (strlen(src) != 1) { + /* remove trailing slash iff path is none root */ + strip_trailing(src, '/'); + ts = true; + } + } + + error = lookup(src, &ddp1, &sname); + if (error != 0) { + return (error); + } + + error = namei_last_nofollow(src, ddp1, &dp1); + if (error != 0) { + drele(ddp1); + return (error); + } + + vp1 = dp1->d_vnode; + vn_lock(vp1); + + if (vp1->v_type != VDIR && ts == true) { + error = ENOTDIR; + goto err1; + } + + ts = false; + if (has_trailing(dest, '/') == true) { + if (strlen(dest) != 1) { + /* remove trailing slash iff path is none root */ + strip_trailing(dest, '/'); + ts = true; + } + } + + error = lookup(dest, &ddp2, &dname); + if (error != 0) { + goto err1; + } + + error = namei_last_nofollow(dest, ddp2, &dp2); + if (error == 0) { + /* target exists */ + + vp2 = dp2->d_vnode; + vn_lock(vp2); + + if (vp2->v_type != VDIR && vp2->v_type != VLNK) { + if (vp1->v_type == VDIR || ts == true) { + error = ENOTDIR; + goto err2; + } + } else if (vp1->v_type != VDIR && vp2->v_type == VDIR) { + error = EISDIR; + goto err2; + } + if (vp2->v_type == VDIR && 
check_dir_empty(dest)) { + error = EEXIST; + goto err2; + } + } else if (error == ENOENT) { + if (vp1->v_type != VDIR && ts == true) { + error = ENOTDIR; + goto err2; + } + } else { + goto err2; + } + + if (strcmp(dest, "/")) + strip_trailing(dest, '/'); + + if (strcmp(src, "/")) + strip_trailing(src, '/'); + + /* If source and dest are the same, do nothing */ + if (!strncmp(src, dest, PATH_MAX)) + goto err2; + + /* Check if target is directory of source */ + if (is_parent(src, dest)) { + error = EINVAL; + goto err2; + } + + dname = strrchr(dest, '/'); + if (dname == nullptr) { + error = ENOTDIR; + goto err2; + } + if (dname == dest) + dest = root; + + *dname = 0; + dname++; + + dvp1 = ddp1->d_vnode; + vn_lock(dvp1); + + dvp2 = ddp2->d_vnode; + vn_lock(dvp2); + + /* Source and destination directions should be writable) */ + if ((error = vn_access(dvp1, VWRITE)) != 0) + goto err3; + if ((error = vn_access(dvp2, VWRITE)) != 0) + goto err3; + + /* The source and dest must be same file system */ + if (dvp1->v_mount != dvp2->v_mount) { + error = EXDEV; + goto err3; + } + + error = VOP_RENAME(dvp1, vp1, sname, dvp2, vp2, dname); + + dentry_move(dp1, ddp2, dname); + if (dp2) + dentry_remove(dp2); + + err3: + vn_unlock(dvp2); + vn_unlock(dvp1); + err2: + if (vp2) { + vn_unlock(vp2); + drele(dp2); + } + drele(ddp2); + err1: + vn_unlock(vp1); + drele(dp1); + drele(ddp1); + return error; +} + +int +sys_symlink(const char *oldpath, const char *newpath) +{ + struct task *t = main_task; + int error; + std::unique_ptr<char []> up_op (new char[PATH_MAX]); + char *op = up_op.get(); + std::unique_ptr<char []> up_np (new char[PATH_MAX]); + char *np = up_np.get(); + struct dentry *newdp; + struct dentry *newdirdp; + char *name; + + if (oldpath == nullptr || newpath == nullptr) { + return (EFAULT); + } + + DPRINTF(VFSDB_SYSCALL, ("sys_link: oldpath=%s newpath=%s\n", + oldpath, newpath)); + + newdp = nullptr; + newdirdp = nullptr; + + error = task_conv(t, newpath, VWRITE, np); + if 
(error != 0) { + return (error); + } + + /* parent directory for new path must exist */ + if ((error = lookup(np, &newdirdp, &name)) != 0) { + error = ENOENT; + goto out; + } + vn_lock(newdirdp->d_vnode); + + /* newpath should not already exist */ + if (namei_last_nofollow(np, newdirdp, &newdp) == 0) { + drele(newdp); + error = EEXIST; + goto out; + } + + /* check for write access at newpath */ + if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0) { + goto out; + } + + /* oldpath may not be const char * to VOP_SYMLINK - need to copy */ + size_t tocopy; + tocopy = strlcpy(op, oldpath, PATH_MAX); + if (tocopy >= PATH_MAX - 1) { + error = ENAMETOOLONG; + goto out; + } + error = VOP_SYMLINK(newdirdp->d_vnode, name, op); + +out: + if (newdirdp != nullptr) { + vn_unlock(newdirdp->d_vnode); + drele(newdirdp); + } + + return (error); +} + +int +sys_link(char *oldpath, char *newpath) +{ + struct dentry *olddp, *newdp, *newdirdp; + struct vnode *vp; + char *name; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_link: oldpath=%s newpath=%s\n", + oldpath, newpath)); + + /* File from oldpath must exist */ + if ((error = namei(oldpath, &olddp)) != 0) + return error; + + vp = olddp->d_vnode; + vn_lock(vp); + + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + /* If newpath exists, it shouldn't be overwritten */ + if (!namei(newpath, &newdp)) { + error = EEXIST; + goto out; + } + + /* Get pointer to the parent dentry of newpath */ + if ((error = lookup(newpath, &newdirdp, &name)) != 0) + goto out; + + vn_lock(newdirdp->d_vnode); + + /* Both files must reside on the same mounted file system */ + if (olddp->d_mount != newdirdp->d_mount) { + error = EXDEV; + goto out1; + } + + /* Write access to the dir containing newpath is required */ + if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0) + goto out1; + + /* Map newpath into dentry hash with the same vnode as oldpath */ + if (!(newdp = dentry_alloc(newdirdp, vp, newpath))) { + error = ENOMEM; + goto out1; + } + + 
error = VOP_LINK(newdirdp->d_vnode, vp, name); + out1: + vn_unlock(newdirdp->d_vnode); + drele(newdirdp); + out: + vn_unlock(vp); + drele(olddp); + drele(newdp); + return error; +} + +int +sys_unlink(char *path) +{ + char *name; + struct dentry *dp, *ddp; + struct vnode *vp; + int error; + + DPRINTF(VFSDB_SYSCALL, ("sys_unlink: path=%s\n", path)); + + ddp = nullptr; + dp = nullptr; + vp = nullptr; + + error = lookup(path, &ddp, &name); + if (error != 0) { + return (error); + } + + error = namei_last_nofollow(path, ddp, &dp); + if (error != 0) { + goto out; + } + + vp = dp->d_vnode; + vn_lock(vp); + if (vp->v_type == VDIR) { + // Posix specifies that we should return EPERM here, but Linux + // actually returns EISDIR. + error = EISDIR; + goto out; + } + if (vp->v_flags & VROOT) { + error = EBUSY; + goto out; + } + + vn_lock(ddp->d_vnode); + if ((error = vn_access(ddp->d_vnode, VWRITE)) != 0) { + vn_unlock(ddp->d_vnode); + goto out; + } + error = VOP_REMOVE(ddp->d_vnode, vp, name); + vn_unlock(ddp->d_vnode); + + vn_unlock(vp); + dentry_remove(dp); + drele(ddp); + drele(dp); + return error; + out: + if (vp != nullptr) { + vn_unlock(vp); + } + + if (dp != nullptr) { + drele(dp); + } + + if (ddp != nullptr) { + drele(ddp); + } + return error; +} + +int +sys_access(char *path, int mode) +{ + struct dentry *dp; + int error, flags; + + DPRINTF(VFSDB_SYSCALL, ("sys_access: path=%s mode=%x\n", path, mode)); + + /* If F_OK is set, we return here if file is not found. 
*/ + error = namei(path, &dp); + if (error) + return error; + + flags = 0; + if (mode & R_OK) + flags |= VREAD; + if (mode & W_OK) + flags |= VWRITE; + if (mode & X_OK) + flags |= VEXEC; + + error = vn_access(dp->d_vnode, flags); + + drele(dp); + return error; +} + +int +sys_stat(char *path, struct stat *st) +{ + DPRINTF(VFSDB_SYSCALL, ("sys_stat: path=%s\n", path)); + + try { + dentry_ref dp = namei(path); + if (!dp) { + return ENOENT; + } + return vn_stat(dp->d_vnode, st); + } catch (error e) { + return e.get(); + } +} + +int sys_lstat(char *path, struct stat *st) +{ + int error; + struct dentry *ddp; + char *name; + struct dentry *dp; + + DPRINTF(VFSDB_SYSCALL, ("sys_lstat: path=%s\n", path)); + + error = lookup(path, &ddp, &name); + if (error) { + return (error); + } + + error = namei_last_nofollow(path, ddp, &dp); + if (error) { + drele(ddp); + return error; + } + + error = vn_stat(dp->d_vnode, st); + drele(dp); + drele(ddp); + return error; +} + +int +sys_statfs(char *path, struct statfs *buf) +{ + memset(buf, 0, sizeof(*buf)); + try { + dentry_ref dp = namei(path); + if (!dp) { + return ENOENT; + } + return VFS_STATFS(dp->d_mount, buf); + } catch (error e) { + return e.get(); + } +} + +int +sys_fstatfs(struct file *fp, struct statfs *buf) +{ + struct vnode *vp; + int error = 0; + + if (!fp->f_dentry) + return EBADF; + + vp = fp->f_dentry->d_vnode; + memset(buf, 0, sizeof(*buf)); + + vn_lock(vp); + error = VFS_STATFS(vp->v_mount, buf); + vn_unlock(vp); + + return error; +} + +int +sys_truncate(char *path, off_t length) +{ + struct dentry *dp; + int error; + + error = namei(path, &dp); + if (error) + return error; + + vn_lock(dp->d_vnode); + error = VOP_TRUNCATE(dp->d_vnode, length); + vn_unlock(dp->d_vnode); + + drele(dp); + return error; +} + +int +sys_ftruncate(struct file *fp, off_t length) +{ + struct vnode *vp; + int error; + + if (!fp->f_dentry) + return EBADF; + + vp = fp->f_dentry->d_vnode; + vn_lock(vp); + error = VOP_TRUNCATE(vp, length); + 
vn_unlock(vp); + + return error; +} + +int +sys_fchdir(struct file *fp, char *cwd) +{ + struct vnode *dvp; + + if (!fp->f_dentry) + return EBADF; + + dvp = fp->f_dentry->d_vnode; + vn_lock(dvp); + if (dvp->v_type != VDIR) { + vn_unlock(dvp); + return EBADF; + } + strlcpy(cwd, fp->f_dentry->d_path, PATH_MAX); + vn_unlock(dvp); + return 0; +} + +int +sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size) +{ + int error; + struct dentry *ddp; + char *name; + struct dentry *dp; + struct vnode *vp; + struct iovec vec; + struct uio uio; + + *size = 0; + error = lookup(path, &ddp, &name); + if (error) { + return (error); + } + + error = namei_last_nofollow(path, ddp, &dp); + if (error) { + drele(ddp); + return (error); + } + + if (dp->d_vnode->v_type != VLNK) { + drele(dp); + drele(ddp); + return (EINVAL); + } + vec.iov_base = buf; + vec.iov_len = bufsize; + + uio.uio_iov = &vec; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = bufsize; + uio.uio_rw = UIO_READ; + + vp = dp->d_vnode; + vn_lock(vp); + error = VOP_READLINK(vp, &uio); + vn_unlock(vp); + + drele(dp); + drele(ddp); + + if (error) { + return (error); + } + + *size = bufsize - uio.uio_resid; + return (0); +} + +/* + * Check the validity of the members of a struct timeval. + */ +static bool is_timeval_valid(const struct timeval *time) +{ + return (time->tv_sec >= 0) && + (time->tv_usec >= 0 && time->tv_usec < 1000000); +} + +/* + * Convert a timeval struct to a timespec one. 
+ */ +static void convert_timeval(struct timespec &to, const struct timeval *from) +{ + if (from) { + to.tv_sec = from->tv_sec; + to.tv_nsec = from->tv_usec * 1000; // Convert microseconds to nanoseconds + } else { + clock_gettime(CLOCK_REALTIME, &to); + } +} + +int +sys_utimes(char *path, const struct timeval times[2], int flags) +{ + int error; + struct dentry *dp; + struct timespec timespec_times[2]; + + DPRINTF(VFSDB_SYSCALL, ("sys_utimes: path=%s\n", path)); + + if (times && (!is_timeval_valid(&times[0]) || !is_timeval_valid(&times[1]))) + return EINVAL; + + // Convert each element of timeval array to the timespec type + convert_timeval(timespec_times[0], times ? times + 0 : nullptr); + convert_timeval(timespec_times[1], times ? times + 1 : nullptr); + + if (flags & AT_SYMLINK_NOFOLLOW) { + struct dentry *ddp; + error = lookup(path, &ddp, nullptr); + if (error) { + return error; + } + + error = namei_last_nofollow(path, ddp, &dp); + if (ddp != nullptr) { + drele(ddp); + } + if (error) { + return error; + } + } else { + error = namei(path, &dp); + if (error) + return error; + } + + if (dp->d_mount->m_flags & MNT_RDONLY) { + error = EROFS; + } else { + error = vn_settimes(dp->d_vnode, timespec_times); + } + + drele(dp); + return error; +} + +/* + * Check the validity of members of a struct timespec + */ +static bool is_timespec_valid(const struct timespec &time) +{ + return (time.tv_sec >= 0) && + ((time.tv_nsec >= 0 && time.tv_nsec <= 999999999) || + time.tv_nsec == UTIME_NOW || + time.tv_nsec == UTIME_OMIT); +} + +void init_timespec(struct timespec &_times, const struct timespec *times) +{ + if (times == nullptr || times->tv_nsec == UTIME_NOW) { + clock_gettime(CLOCK_REALTIME, &_times); + } else { + _times.tv_sec = times->tv_sec; + _times.tv_nsec = times->tv_nsec; + } + return; +} + +int +sys_utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags) +{ + int error; + std::string ap; + struct timespec timespec_times[2]; + extern 
struct task *main_task; + struct dentry *dp; + + /* utimensat should return ENOENT when pathname is empty */ + if(pathname && pathname[0] == 0) + return ENOENT; + + if (flags && !(flags & AT_SYMLINK_NOFOLLOW)) + return EINVAL; + + if (times && (!is_timespec_valid(times[0]) || !is_timespec_valid(times[1]))) + return EINVAL; + + init_timespec(timespec_times[0], times ? times + 0 : nullptr); + init_timespec(timespec_times[1], times ? times + 1 : nullptr); + + if (pathname && pathname[0] == '/') { + ap = pathname; + } else if (dirfd == AT_FDCWD) { + if (!pathname) + return EFAULT; + ap = std::string(main_task->t_cwd) + "/" + pathname; + } else { + struct file *fp; + fileref f(fileref_from_fd(dirfd)); + + if (!f) + return EBADF; + + fp = f.get(); + + if(!fp->f_dentry) + return EBADF; + + if (!(fp->f_dentry->d_vnode->v_type & VDIR)) + return ENOTDIR; + + if (pathname) + ap = std::string(fp->f_dentry->d_path) + "/" + pathname; + else + ap = fp->f_dentry->d_path; + + ap = std::string(fp->f_dentry->d_mount->m_path) + "/" + ap; + } + + /* FIXME: Add support for AT_SYMLINK_NOFOLLOW */ + + error = namei(ap.c_str(), &dp); + + if (error) + return error; + + if (dp->d_mount->m_flags & MNT_RDONLY) { + error = EROFS; + } else { + if (vn_access(dp->d_vnode, VWRITE)) { + return EACCES; + } + if (times && + (times[0].tv_nsec != UTIME_NOW || times[1].tv_nsec != UTIME_NOW) && + (times[0].tv_nsec != UTIME_OMIT || times[1].tv_nsec != UTIME_OMIT) && + (!(dp->d_vnode->v_mode & ~VAPPEND))) + return EPERM; + error = vn_settimes(dp->d_vnode, timespec_times); + } + + drele(dp); + return error; +} + +int +sys_futimens(int fd, const struct timespec times[2]) +{ + struct file *fp; + + fileref f(fileref_from_fd(fd)); + if (!f) + return EBADF; + + fp = f.get(); + + if (!fp->f_dentry) + return EBADF; + + std::string pathname = fp->f_dentry->d_path; + auto error = sys_utimensat(AT_FDCWD, pathname.c_str(), times, 0); + return error; +} + +int +sys_fallocate(struct file *fp, int mode, loff_t offset, 
loff_t len) +{ + int error; + struct vnode *vp; + + DPRINTF(VFSDB_SYSCALL, ("sys_fallocate: fp=%x", fp)); + + if (!fp->f_dentry || !(fp->f_flags & FWRITE)) { + return EBADF; + } + + if (offset < 0 || len <= 0) { + return EINVAL; + } + + // Strange, but that's what Linux returns. + if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) { + return ENOTSUP; + } + + vp = fp->f_dentry->d_vnode; + vn_lock(vp); + + // NOTE: It's not detected here whether or not the device underlying + // the fs is a block device. It's up to the fs itself tell us whether + // or not fallocate is supported. See below: + if (vp->v_type != VREG && vp->v_type != VDIR) { + error = ENODEV; + goto ret; + } + + // EOPNOTSUPP here means that the underlying file system + // referred by vp doesn't support fallocate. + if (!vp->v_op->vop_fallocate) { + error = EOPNOTSUPP; + goto ret; + } + + error = VOP_FALLOCATE(vp, mode, offset, len); +ret: + vn_unlock(vp); + return error; +} + +int +sys_chmod(const char *path, mode_t mode) +{ + int error; + struct dentry *dp; + DPRINTF(VFSDB_SYSCALL, ("sys_chmod: path=%s\n", path)); + error = namei(path, &dp); + if (error) + return error; + if (dp->d_mount->m_flags & MNT_RDONLY) { + error = EROFS; + } else { + error = vn_setmode(dp->d_vnode, mode); + } + drele(dp); + return error; +} + +int +sys_fchmod(int fd, mode_t mode) +{ + fileref f(fileref_from_fd(fd)); + if (!f) + return EBADF; + // Posix is ambivalent on what fchmod() should do on an fd that does not + // refer to a real file. It suggests an implementation may (but not must) + // fail EINVAL on a pipe, can behave in an "unspecified" manner on a + // socket, and for a STREAM, it must succeed and do nothing. Linux seems + // to just do the last thing (do nothing and succeed). 
+ if (!f->f_dentry) { + return 0; + } + if (f->f_dentry->d_mount->m_flags & MNT_RDONLY) { + return EROFS; + } else { + return vn_setmode(f->f_dentry->d_vnode, mode); + } +} diff --git a/lib/vfscore/task.c b/lib/vfscore/task.c new file mode 100644 index 00000000..7a355034 --- /dev/null +++ b/lib/vfscore/task.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 2007, Kohsuke Ohtani All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * vfs_task.c - Routines to manage the per task data. 
+ */ + + +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> + +#include <osv/prex.h> +#include "vfs.h" + +/* + * Allocate new task. + */ +int +task_alloc(struct task **pt) +{ + struct task *t; + + // FIXME: where do we free task ? + if (!(t = new task)) + return ENOMEM; + memset(t, 0, sizeof(struct task)); + strlcpy(t->t_cwd, "/", sizeof(t->t_cwd)); + + *pt = t; + return 0; +} + +/* + * Convert to full path from the cwd of task and path. + * @wd: working directory + * @path: target path + * @full: full path to be returned + */ +int +path_conv(char *wd, const char *cpath, char *full) +{ + char path[PATH_MAX]; + char *src, *tgt, *p, *end; + size_t len = 0; + + strlcpy(path, cpath, PATH_MAX); + path[PATH_MAX - 1] = '\0'; + + len = strlen(path); + if (len >= PATH_MAX) + return ENAMETOOLONG; + if (strlen(wd) + len >= PATH_MAX) + return ENAMETOOLONG; + src = path; + tgt = full; + end = src + len; + if (path[0] == '/') { + *tgt++ = *src++; + len = 1; + } else { + strlcpy(full, wd, PATH_MAX); + len = strlen(wd); + tgt += len; + if (len > 1 && path[0] != '.') { + *tgt = '/'; + tgt++; + len++; + } + } + while (*src) { + p = src; + while (*p != '/' && *p != '\0') + p++; + *p = '\0'; + if (!strcmp(src, "..")) { + if (len >= 2) { + len -= 2; + tgt -= 2; /* skip previous '/' */ + while (*tgt != '/') { + tgt--; + len--; + } + if (len == 0) { + tgt++; + len++; + } + } + } else if (!strcmp(src, ".")) { + /* Ignore "." */ + } else { + while (*src != '\0') { + *tgt++ = *src++; + len++; + } + } + if (p == end) + break; + if (len > 0 && *(tgt - 1) != '/') { + *tgt++ = '/'; + len++; + } + src = p + 1; + } + *tgt = '\0'; + + return (0); +} + +/* + * Convert to full path from the cwd of task and path. 
+ * @t: task structure + * @path: target path + * @full: full path to be returned + * @acc: access mode + */ +int +task_conv(struct task *t, const char *cpath, int acc, char *full) +{ + int rc; + + rc = path_conv(t->t_cwd, cpath, full); + if (rc != 0) { + return (rc); + } + + /* Check if the client task has required permission */ + return (0); //sec_file_permission(t->t_taskid, full, acc); +} + +/* + * Safe copying function that checks for overflow. + */ +int vfs_dname_copy(char *dest, const char *src, size_t size) +{ + if (strlcpy(dest, src, size) >= size) { + return -1; + } + return 0; +} diff --git a/lib/vfscore/vfs.h b/lib/vfscore/vfs.h new file mode 100644 index 00000000..d86ef957 --- /dev/null +++ b/lib/vfscore/vfs.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2005-2007, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VFS_H +#define _VFS_H + +#include <sys/cdefs.h> +#include <assert.h> +#include <dirent.h> +#include <limits.h> + +#include <osv/prex.h> +#include <osv/file.h> +#include <osv/mount.h> +#include <osv/vnode.h> +#include <osv/dentry.h> +#include <osv/error.h> + +/* + * Import vnode attributes flags + */ +#include <osv/vnode_attr.h> + +/* #define DEBUG_VFS 1 */ + +/* + * Tunable parameters + */ +#define FSMAXNAMES 16 /* max length of 'file system' name */ + +#ifdef DEBUG_VFS +#include <osv/debug.h> + +extern int vfs_debug; + +#define VFSDB_CORE 0x00000001 +#define VFSDB_SYSCALL 0x00000002 +#define VFSDB_VNODE 0x00000004 +#define VFSDB_BIO 0x00000008 +#define VFSDB_CAP 0x00000010 + +#define VFSDB_FLAGS 0x00000013 + +#define DPRINTF(_m,X) if (vfs_debug & (_m)) kprintf X +#else +#define DPRINTF(_m, X) +#endif + +#define ASSERT(e) assert(e) + +#define OPEN_MAX 256 + +/* + * per task data + */ +struct task { + char t_cwd[PATH_MAX]; /* current working directory */ + struct file *t_cwdfp; /* directory for cwd */ +}; + +extern const struct vfssw vfssw[]; + +__BEGIN_DECLS +int sys_open(char *path, int flags, mode_t mode, struct file **fp); +int sys_read(struct file *fp, const struct iovec *iov, size_t niov, + off_t offset, size_t *count); +int sys_write(struct file *fp, const struct iovec *iov, size_t niov, + off_t offset, size_t *count); +int sys_lseek(struct file *fp, off_t off, int type, off_t * cur_off); +int sys_ioctl(struct file *fp, 
u_long request, void *buf); +int sys_fstat(struct file *fp, struct stat *st); +int sys_fstatfs(struct file *fp, struct statfs *buf); +int sys_fsync(struct file *fp); +int sys_ftruncate(struct file *fp, off_t length); + +int sys_readdir(struct file *fp, struct dirent *dirent); +int sys_rewinddir(struct file *fp); +int sys_seekdir(struct file *fp, long loc); +int sys_telldir(struct file *fp, long *loc); +int sys_fchdir(struct file *fp, char *path); + +int sys_mkdir(char *path, mode_t mode); +int sys_rmdir(char *path); +int sys_mknod(char *path, mode_t mode); +int sys_rename(char *src, char *dest); +int sys_link(char *oldpath, char *newpath); +int sys_unlink(char *path); +int sys_symlink(const char *oldpath, const char *newpath); +int sys_access(char *path, int mode); +int sys_stat(char *path, struct stat *st); +int sys_lstat(char *path, struct stat *st); +int sys_statfs(char *path, struct statfs *buf); +int sys_truncate(char *path, off_t length); +int sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size); +int sys_utimes(char *path, const struct timeval times[2], int flags); +int sys_utimensat(int dirfd, const char *pathname, + const struct timespec times[2], int flags); +int sys_futimens(int fd, const struct timespec times[2]); +int sys_fallocate(struct file *fp, int mode, loff_t offset, loff_t len); + +int sys_mount(const char *dev, const char *dir, const char *fsname, int flags, const void *data); +int sys_umount2(const char *path, int flags); +int sys_umount(const char *path); +int sys_pivot_root(const char *new_root, const char *old_put); +int sys_sync(void); +int sys_chmod(const char *path, mode_t mode); +int sys_fchmod(int fd, mode_t mode); + + +int task_alloc(struct task **pt); +int task_conv(struct task *t, const char *path, int mode, char *full); +int path_conv(char *wd, const char *cpath, char *full); + +//int sec_file_permission(task_t task, char *path, int mode); +int sec_vnode_permission(char *path); + +int namei(const char *path, struct 
dentry **dpp); +int namei_last_nofollow(char *path, struct dentry *ddp, struct dentry **dp); +int lookup(char *path, struct dentry **dpp, char **name); +void vnode_init(void); +void lookup_init(void); + +int vfs_findroot(const char *path, struct mount **mp, char **root); +int vfs_dname_copy(char *dest, const char *src, size_t size); + +int fs_noop(void); + +struct dentry *dentry_alloc(struct dentry *parent_dp, struct vnode *vp, const char *path); +struct dentry *dentry_lookup(struct mount *mp, char *path); +void dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path); +void dentry_remove(struct dentry *dp); +void dref(struct dentry *dp); +void drele(struct dentry *dp); +void dentry_init(void); + +#ifdef DEBUG_VFS +void vnode_dump(void); +void mount_dump(void); +#endif + +__END_DECLS + +#ifdef __cplusplus + +// Convert a path to a dentry_ref. Returns an empty +// reference if not found (ENOENT) for efficiency, throws +// an error on other errors. +inline dentry_ref namei(char* path) +{ + dentry* dp; + auto err = namei(path, &dp); + if (err == ENOENT) { + return dentry_ref(); + } else if (err) { + throw make_error(err); + } else { + return dentry_ref(dp, false); + } +} + +#endif + +#endif /* !_VFS_H */ diff --git a/lib/vfscore/vnode.c b/lib/vfscore/vnode.c new file mode 100644 index 00000000..a292344f --- /dev/null +++ b/lib/vfscore/vnode.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2005-2008, Kohsuke Ohtani + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * vfs_vnode.c - vnode service + */ + +#include <limits.h> +#include <unistd.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <sys/stat.h> + +#include <osv/prex.h> +#include <osv/vnode.h> +#include "vfs.h" + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[10] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT +}; + +/* + * Memo: + * + * Function Ref count Lock + * ---------- --------- ---------- + * vn_lock * Lock + * vn_unlock * Unlock + * vget 1 Lock + * vput -1 Unlock + * vref +1 * + * vrele -1 * + */ + +#define VNODE_BUCKETS 32 /* size of vnode hash table */ + +/* + * vnode table. + * All active (opened) vnodes are stored on this hash table. + * They can be accessed by its path name. 
+ */ +static LIST_HEAD(vnode_hash_head, vnode) vnode_table[VNODE_BUCKETS]; + +/* + * Global lock to access all vnodes and vnode table. + * If a vnode is already locked, there is no need to + * lock this global lock to access internal data. + */ +static mutex_t vnode_lock = MUTEX_INITIALIZER; +#define VNODE_LOCK() mutex_lock(&vnode_lock) +#define VNODE_UNLOCK() mutex_unlock(&vnode_lock) +#define VNODE_OWNED() mutex_owned(&vnode_lock) + +/* + * Get the hash value from the mount point and path name. + * XXX(hch): replace with a better hash for 64-bit pointers. + */ +static u_int +vn_hash(struct mount *mp, uint64_t ino) +{ + return (ino ^ (unsigned long)mp) & (VNODE_BUCKETS - 1); +} + +/* + * Returns locked vnode for specified mount point and path. + * vn_lock() will increment the reference count of vnode. + * + * Locking: VNODE_LOCK must be held. + */ +struct vnode * +vn_lookup(struct mount *mp, uint64_t ino) +{ + struct vnode *vp; + + assert(VNODE_OWNED()); + LIST_FOREACH(vp, &vnode_table[vn_hash(mp, ino)], v_link) { + if (vp->v_mount == mp && vp->v_ino == ino) { + vp->v_refcnt++; + mutex_lock(&vp->v_lock); + vp->v_nrlocks++; + return vp; + } + } + return nullptr; /* not found */ +} + +#ifdef DEBUG_VFS +static const char * +vn_path(struct vnode *vp) +{ + struct dentry *dp; + + if (LIST_EMPTY(&vp->v_names) == 1) { + return (" "); + } + dp = LIST_FIRST(&vp->v_names); + return (dp->d_path); +} +#endif + +/* + * Lock vnode + */ +void +vn_lock(struct vnode *vp) +{ + ASSERT(vp); + ASSERT(vp->v_refcnt > 0); + + mutex_lock(&vp->v_lock); + vp->v_nrlocks++; + DPRINTF(VFSDB_VNODE, ("vn_lock: %s\n", vn_path(vp))); +} + +/* + * Unlock vnode + */ +void +vn_unlock(struct vnode *vp) +{ + ASSERT(vp); + ASSERT(vp->v_refcnt > 0); + ASSERT(vp->v_nrlocks > 0); + + vp->v_nrlocks--; + mutex_unlock(&vp->v_lock); + DPRINTF(VFSDB_VNODE, ("vn_lock: %s\n", vn_path(vp))); +} + +/* + * Allocate new vnode for specified path. + * Increment its reference count and lock it. 
+ * Returns 1 if vnode was found in cache; otherwise returns 0. + */ +int +vget(struct mount *mp, uint64_t ino, struct vnode **vpp) +{ + struct vnode *vp; + int error; + + *vpp = nullptr; + + DPRINTF(VFSDB_VNODE, ("vget %LLu\n", ino)); + + VNODE_LOCK(); + + vp = vn_lookup(mp, ino); + if (vp) { + VNODE_UNLOCK(); + *vpp = vp; + return 1; + } + + if (!(vp = new vnode())) { + VNODE_UNLOCK(); + return 0; + } + + LIST_INIT(&vp->v_names); + vp->v_ino = ino; + vp->v_mount = mp; + vp->v_refcnt = 1; + vp->v_op = mp->m_op->vfs_vnops; + vp->v_nrlocks = 0; + + /* + * Request to allocate fs specific data for vnode. + */ + if ((error = VFS_VGET(mp, vp)) != 0) { + VNODE_UNLOCK(); + delete vp; + return error; + } + vfs_busy(vp->v_mount); + mutex_lock(&vp->v_lock); + vp->v_nrlocks++; + + LIST_INSERT_HEAD(&vnode_table[vn_hash(mp, ino)], vp, v_link); + VNODE_UNLOCK(); + + *vpp = vp; + + return 0; +} + +/* + * Unlock vnode and decrement its reference count. + */ +void +vput(struct vnode *vp) +{ + ASSERT(vp); + ASSERT(vp->v_nrlocks > 0); + ASSERT(vp->v_refcnt > 0); + DPRINTF(VFSDB_VNODE, ("vput: ref=%d %s\n", vp->v_refcnt, vn_path(vp))); + + VNODE_LOCK(); + vp->v_refcnt--; + if (vp->v_refcnt > 0) { + VNODE_UNLOCK(); + vn_unlock(vp); + return; + } + LIST_REMOVE(vp, v_link); + VNODE_UNLOCK(); + + /* + * Deallocate fs specific vnode data + */ + if (vp->v_op->vop_inactive) + VOP_INACTIVE(vp); + vfs_unbusy(vp->v_mount); + vp->v_nrlocks--; + ASSERT(vp->v_nrlocks == 0); + mutex_unlock(&vp->v_lock); + delete vp; +} + +/* + * Increment the reference count on an active vnode. + */ +void +vref(struct vnode *vp) +{ + ASSERT(vp); + ASSERT(vp->v_refcnt > 0); /* Need vget */ + + VNODE_LOCK(); + DPRINTF(VFSDB_VNODE, ("vref: ref=%d\n", vp->v_refcnt)); + vp->v_refcnt++; + VNODE_UNLOCK(); +} + +/* + * Decrement the reference count of the vnode. + * Any code in the system which is using vnode should call vrele() + * when it is finished with the vnode. 
+ * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(struct vnode *vp) +{ + ASSERT(vp); + ASSERT(vp->v_refcnt > 0); + + VNODE_LOCK(); + DPRINTF(VFSDB_VNODE, ("vrele: ref=%d\n", vp->v_refcnt)); + vp->v_refcnt--; + if (vp->v_refcnt > 0) { + VNODE_UNLOCK(); + return; + } + LIST_REMOVE(vp, v_link); + VNODE_UNLOCK(); + + /* + * Deallocate fs specific vnode data + */ + VOP_INACTIVE(vp); + vfs_unbusy(vp->v_mount); + delete vp; +} + +/* + * Remove all vnode in the vnode table for unmount. + */ +void +vflush(struct mount *mp) +{ +} + +int +vn_stat(struct vnode *vp, struct stat *st) +{ + struct vattr vattr; + struct vattr *vap; + mode_t mode; + int error; + + vap = &vattr; + + memset(st, 0, sizeof(struct stat)); + + memset(vap, 0, sizeof(struct vattr)); + + error = VOP_GETATTR(vp, vap); + if (error) + return error; + + st->st_ino = (ino_t)vap->va_nodeid; + st->st_size = vap->va_size; + mode = vap->va_mode; + switch (vp->v_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return EBADF; + }; + st->st_mode = mode; + st->st_nlink = vap->va_nlink; + st->st_blksize = BSIZE; + st->st_blocks = vap->va_size / S_BLKSIZE; + st->st_uid = vap->va_uid; + st->st_gid = vap->va_gid; + st->st_dev = vap->va_fsid; + if (vp->v_type == VCHR || vp->v_type == VBLK) + st->st_rdev = vap->va_rdev; + + st->st_atim = vap->va_atime; + st->st_mtim = vap->va_mtime; + st->st_ctim = vap->va_ctime; + + return 0; +} + +/* + * Set access and modification times of the vnode + */ +int +vn_settimes(struct vnode *vp, struct timespec times[2]) +{ + struct vattr vattr; + struct vattr *vap; + int error; + + vap = &vattr; + memset(vap, 0, sizeof(struct vattr)); + + vap->va_atime = times[0]; + vap->va_mtime = times[1]; + 
vap->va_mask = ((times[0].tv_nsec == UTIME_OMIT) ? 0 : AT_ATIME) + | ((times[1].tv_nsec == UTIME_OMIT) ? 0 : AT_MTIME); + vn_lock(vp); + error = VOP_SETATTR(vp, vap); + vn_unlock(vp); + + return error; +} + +/* + * Set chmod permissions on the vnode. + */ +int +vn_setmode(struct vnode *vp, mode_t new_mode) +{ + struct vattr vattr; + memset(&vattr, 0, sizeof(vattr)); + vattr.va_mode = new_mode; + vattr.va_mask = AT_MODE; + vn_lock(vp); + vp->v_mode = new_mode; + int error = VOP_SETATTR(vp, &vattr); + vn_unlock(vp); + return error; +} + +/* + * Check permission on vnode pointer. + */ +int +vn_access(struct vnode *vp, int flags) +{ + int error = 0; + + if ((flags & VEXEC) && (vp->v_mode & 0111) == 0) { + error = EACCES; + goto out; + } + if ((flags & VREAD) && (vp->v_mode & 0444) == 0) { + error = EACCES; + goto out; + } + if (flags & VWRITE) { + if (vp->v_mount->m_flags & MNT_RDONLY) { + error = EROFS; + goto out; + } + if ((vp->v_mode & 0222) == 0) { + error = EACCES; + goto out; + } + } + out: + return error; +} + +#ifdef DEBUG_VFS +/* + * Dump all all vnode. + */ +void +vnode_dump(void) +{ + int i; + struct vnode *vp; + struct mount *mp; + char type[][6] = { "VNON ", "VREG ", "VDIR ", "VBLK ", "VCHR ", + "VLNK ", "VSOCK", "VFIFO" }; + + VNODE_LOCK(); + kprintf("Dump vnode\n"); + kprintf(" vnode mount type refcnt blkno path\n"); + kprintf(" -------- -------- ----- ------ -------- ------------------------------\n"); + + for (i = 0; i < VNODE_BUCKETS; i++) { + LIST_FOREACH(vp, &vnode_table[i], v_link) { + mp = vp->v_mount; + + kprintf(" %08x %08x %s %6d %8d %s%s\n", (u_long)vp, + (u_long)mp, type[vp->v_type], vp->v_refcnt, + (strlen(mp->m_path) == 1) ? 
"\0" : mp->m_path, + vn_path(vp)); + } + } + kprintf("\n"); + VNODE_UNLOCK(); +} +#endif + +int +vop_nullop(void) +{ + return 0; +} + +int +vop_einval(void) +{ + return EINVAL; +} + +int +vop_eperm(void) +{ + return EPERM; +} + +int +vop_erofs(void) +{ + return EROFS; +} + +/* + * vnode_init() is called once (from vfs_init) + * in initialization. + */ +void +vnode_init(void) +{ + int i; + + for (i = 0; i < VNODE_BUCKETS; i++) + LIST_INIT(&vnode_table[i]); +} + +void vn_add_name(struct vnode *vp, struct dentry *dp) +{ + vn_lock(vp); + LIST_INSERT_HEAD(&vp->v_names, dp, d_names_link); + vn_unlock(vp); +} + +void vn_del_name(struct vnode *vp, struct dentry *dp) +{ + vn_lock(vp); + LIST_REMOVE(dp, d_names_link); + vn_unlock(vp); +} + -- 2.19.2 _______________________________________________ Minios-devel mailing list Minios-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/minios-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.