|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Minios-devel] [UNIKRAFT PATCH 04/22] lib/vfscore: Initial import of OSv vfs
The code is imported as is.
Commit f1f42915a33bebe120e70af1f32c1a4d92bac780
Signed-off-by: Yuri Volchkov <yuri.volchkov@xxxxxxxxx>
---
lib/vfscore/dentry.c | 234 +++
lib/vfscore/fops.c | 189 ++
lib/vfscore/include/vfscore/dentry.h | 45 +
lib/vfscore/include/vfscore/mount.h | 171 ++
lib/vfscore/include/vfscore/prex.h | 34 +
lib/vfscore/include/vfscore/uio.h | 89 +
lib/vfscore/include/vfscore/vnode.h | 246 +++
lib/vfscore/lookup.c | 375 ++++
lib/vfscore/main.c | 2413 ++++++++++++++++++++++++++
lib/vfscore/mount.c | 491 ++++++
lib/vfscore/subr_uio.c | 73 +
lib/vfscore/syscalls.c | 1486 ++++++++++++++++
lib/vfscore/task.c | 167 ++
lib/vfscore/vfs.h | 189 ++
lib/vfscore/vnode.c | 522 ++++++
15 files changed, 6724 insertions(+)
create mode 100644 lib/vfscore/dentry.c
create mode 100644 lib/vfscore/fops.c
create mode 100644 lib/vfscore/include/vfscore/dentry.h
create mode 100644 lib/vfscore/include/vfscore/mount.h
create mode 100644 lib/vfscore/include/vfscore/prex.h
create mode 100644 lib/vfscore/include/vfscore/uio.h
create mode 100644 lib/vfscore/include/vfscore/vnode.h
create mode 100644 lib/vfscore/lookup.c
create mode 100644 lib/vfscore/main.c
create mode 100644 lib/vfscore/mount.c
create mode 100644 lib/vfscore/subr_uio.c
create mode 100644 lib/vfscore/syscalls.c
create mode 100644 lib/vfscore/task.c
create mode 100644 lib/vfscore/vfs.h
create mode 100644 lib/vfscore/vnode.c
diff --git a/lib/vfscore/dentry.c b/lib/vfscore/dentry.c
new file mode 100644
index 00000000..facd9eaa
--- /dev/null
+++ b/lib/vfscore/dentry.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <sys/param.h>
+
+#include <osv/dentry.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+#define DENTRY_BUCKETS 32
+
+static LIST_HEAD(dentry_hash_head, dentry) dentry_hash_table[DENTRY_BUCKETS];
+static LIST_HEAD(fake, dentry) fake;
+static mutex dentry_hash_lock;
+
+/*
+ * Map a (mount point, path name) pair to a bucket index of the dentry
+ * hash table. The path bytes are folded with a multiply-by-33 hash and
+ * the result is XOR-ed with the mount pointer before masking.
+ * XXX: replace with a better hash for 64-bit pointers.
+ */
+static u_int
+dentry_hash(struct mount *mp, const char *path)
+{
+    u_int hash = 0;
+    const char *p;
+
+    /* A null path contributes nothing; only the mount pointer is hashed. */
+    for (p = path; p && *p; p++) {
+        hash = hash * 33 + *p;
+    }
+
+    return (hash ^ (unsigned long) mp) & (DENTRY_BUCKETS - 1);
+}
+
+
+/*
+ * Allocate a dentry for vnode vp below parent_dp and insert it into the
+ * dentry hash table.
+ *
+ * The new dentry starts with a reference count of 1. It takes a reference
+ * on vp and, when parent_dp is non-null, on parent_dp, and it is linked
+ * into parent_dp's children list. A private copy of path is stored in
+ * d_path.
+ *
+ * Returns the new dentry, or nullptr if memory could not be allocated.
+ */
+struct dentry *
+dentry_alloc(struct dentry *parent_dp, struct vnode *vp, const char *path)
+{
+    struct mount *mp = vp->v_mount;
+    struct dentry *dp = (dentry*)calloc(1, sizeof(*dp));
+
+    if (!dp) {
+        return nullptr;
+    }
+
+    // Copy the path before any reference is taken, so that a failed
+    // allocation can be unwound with a plain free(). A null d_path must
+    // never reach the hash table: dentry_lookup() passes it to strncmp().
+    dp->d_path = strdup(path);
+    if (!dp->d_path) {
+        free(dp);
+        return nullptr;
+    }
+
+    vref(vp);
+
+    dp->d_refcnt = 1;
+    dp->d_vnode = vp;
+    dp->d_mount = mp;
+    LIST_INIT(&dp->d_children);
+
+    if (parent_dp) {
+        dref(parent_dp);
+        WITH_LOCK(parent_dp->d_lock) {
+            // Insert dp into its parent's children list.
+            LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link);
+        }
+    }
+    dp->d_parent = parent_dp;
+
+    vn_add_name(vp, dp);
+
+    mutex_lock(&dentry_hash_lock);
+    LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(mp, path)], dp, d_link);
+    mutex_unlock(&dentry_hash_lock);
+    return dp;
+}
+
+/*
+ * Find the dentry hashed for path on mount mp.
+ *
+ * On a hit the dentry's reference count is incremented before it is
+ * returned. Returns nullptr when no matching dentry is in the table.
+ */
+struct dentry *
+dentry_lookup(struct mount *mp, char *path)
+{
+    struct dentry *found = nullptr;
+    struct dentry *cur;
+
+    mutex_lock(&dentry_hash_lock);
+    LIST_FOREACH(cur, &dentry_hash_table[dentry_hash(mp, path)], d_link) {
+        if (cur->d_mount != mp || strncmp(cur->d_path, path, PATH_MAX) != 0) {
+            continue;
+        }
+        cur->d_refcnt++;
+        found = cur;
+        break;
+    }
+    mutex_unlock(&dentry_hash_lock);
+
+    return found; /* nullptr: not found */
+}
+
+/*
+ * Take every child of dp out of the dentry hash table.
+ *
+ * Each child is parked on the fake list rather than left unlinked, the
+ * same way dentry_remove() does it, because drele() unconditionally calls
+ * LIST_REMOVE() on d_link when the last reference is dropped. Running
+ * LIST_REMOVE() on an element that is on no list would write through
+ * stale neighbour pointers.
+ *
+ * Called from dentry_move() with dentry_hash_lock held.
+ */
+static void dentry_children_remove(struct dentry *dp)
+{
+    struct dentry *entry = nullptr;
+
+    WITH_LOCK(dp->d_lock) {
+        LIST_FOREACH(entry, &dp->d_children, d_children_link) {
+            ASSERT(entry);
+            ASSERT(entry->d_refcnt > 0);
+            LIST_REMOVE(entry, d_link);
+            // Keep d_link valid for the later LIST_REMOVE() in drele().
+            LIST_INSERT_HEAD(&fake, entry, d_link);
+        }
+    }
+}
+
+/*
+ * Re-parent dp under parent_dp and re-hash it under the new path.
+ *
+ * dp is unlinked from its old parent's children list and, when parent_dp
+ * is non-null, linked into the new parent's list after taking a reference
+ * on parent_dp. Under dentry_hash_lock, dp's children are removed from the
+ * hash table and dp itself is re-inserted using a fresh copy of path.
+ * Finally the reference on the old parent is dropped and the old path
+ * string is freed.
+ */
+void
+dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path)
+{
+ struct dentry *old_pdp = dp->d_parent;
+ char *old_path = dp->d_path;
+
+ if (old_pdp) {
+ WITH_LOCK(old_pdp->d_lock) {
+ // Remove dp from its old parent's children list.
+ LIST_REMOVE(dp, d_children_link);
+ }
+ }
+
+ if (parent_dp) {
+ dref(parent_dp);
+ WITH_LOCK(parent_dp->d_lock) {
+ // Insert dp into its new parent's children list.
+ LIST_INSERT_HEAD(&parent_dp->d_children, dp, d_children_link);
+ }
+ }
+
+ WITH_LOCK(dentry_hash_lock) {
+ // Remove all dp's child dentries from the hashtable.
+ dentry_children_remove(dp);
+ // Remove dp with outdated hash info from the hashtable.
+ LIST_REMOVE(dp, d_link);
+ // Update dp.
+ // NOTE(review): the strdup() result is not checked; on allocation
+ // failure d_path becomes null while the old path is still freed below.
+ dp->d_path = strdup(path);
+ dp->d_parent = parent_dp;
+ // Insert dp updated hash info into the hashtable.
+ LIST_INSERT_HEAD(&dentry_hash_table[dentry_hash(dp->d_mount, path)],
+ dp, d_link);
+ }
+
+ // The reference on the old parent is dropped only after dp has been
+ // fully re-linked.
+ if (old_pdp) {
+ drele(old_pdp);
+ }
+
+ free(old_path);
+}
+
+/*
+ * Take dp out of the dentry hash table.
+ */
+void
+dentry_remove(struct dentry *dp)
+{
+    WITH_LOCK(dentry_hash_lock) {
+        LIST_REMOVE(dp, d_link);
+        /* Park dp on the fake list so drele() can still unlink d_link. */
+        LIST_INSERT_HEAD(&fake, dp, d_link);
+    }
+}
+
+/*
+ * Take an additional reference on dp. The caller must already hold one:
+ * the count is asserted to be positive before it is incremented.
+ */
+void
+dref(struct dentry *dp)
+{
+    ASSERT(dp);
+    ASSERT(dp->d_refcnt > 0);
+
+    /* The reference count is modified under dentry_hash_lock. */
+    WITH_LOCK(dentry_hash_lock) {
+        dp->d_refcnt++;
+    }
+}
+
+/*
+ * Drop one reference on dp.
+ *
+ * When the count reaches zero, dp is unlinked from the list it is on via
+ * d_link, vn_del_name() is called for its vnode, and it is removed from
+ * its parent's children list. The references it held on the parent and on
+ * the vnode are then released, and the path string and the dentry itself
+ * are freed.
+ */
+void
+drele(struct dentry *dp)
+{
+ ASSERT(dp);
+ ASSERT(dp->d_refcnt > 0);
+
+ mutex_lock(&dentry_hash_lock);
+ // Not the last reference: nothing else to do.
+ if (--dp->d_refcnt) {
+ mutex_unlock(&dentry_hash_lock);
+ return;
+ }
+ LIST_REMOVE(dp, d_link);
+ vn_del_name(dp->d_vnode, dp);
+
+ mutex_unlock(&dentry_hash_lock);
+
+ if (dp->d_parent) {
+ WITH_LOCK(dp->d_parent->d_lock) {
+ // Remove dp from its parent's children list.
+ LIST_REMOVE(dp, d_children_link);
+ }
+ // Releases the parent reference; this may recurse up the tree.
+ drele(dp->d_parent);
+ }
+
+ vrele(dp->d_vnode);
+
+ free(dp->d_path);
+ free(dp);
+}
+
+/*
+ * Initialize every bucket of the dentry hash table to an empty list.
+ */
+void
+dentry_init(void)
+{
+    struct dentry_hash_head *bucket = dentry_hash_table;
+    struct dentry_hash_head *end = dentry_hash_table + DENTRY_BUCKETS;
+
+    for (; bucket != end; bucket++) {
+        LIST_INIT(bucket);
+    }
+}
diff --git a/lib/vfscore/fops.c b/lib/vfscore/fops.c
new file mode 100644
index 00000000..3a8f98b4
--- /dev/null
+++ b/lib/vfscore/fops.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <osv/file.h>
+#include <osv/poll.h>
+#include <fs/vfs/vfs.h>
+#include <osv/vfs_file.hh>
+#include <osv/mmu.hh>
+#include <osv/pagecache.hh>
+
+// Construct a vfs_file: flags are forwarded to the file base class together
+// with the DTYPE_VNODE type tag.
+vfs_file::vfs_file(unsigned flags)
+ : file(flags, DTYPE_VNODE)
+{
+}
+
+// Close the file: run the vnode's close operation under the vnode lock and,
+// on success, reset f_dentry.
+// Returns 0 on success or the error reported by VOP_CLOSE; on error
+// f_dentry is left untouched.
+int vfs_file::close()
+{
+    struct vnode *vp = f_dentry->d_vnode;
+
+    vn_lock(vp);
+    int error = VOP_CLOSE(vp, this);
+    vn_unlock(vp);
+
+    if (!error) {
+        f_dentry.reset();
+    }
+    return error;
+}
+
+// Read through the vnode's read operation while holding the vnode lock.
+// Unless FOF_OFFSET is set in flags, the transfer starts at the file's
+// current offset, and that offset is advanced by the number of bytes
+// consumed when the read succeeds.
+// Returns the error code of VOP_READ.
+int vfs_file::read(struct uio *uio, int flags)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+    const bool use_file_offset = (flags & FOF_OFFSET) == 0;
+    const ssize_t requested = uio->uio_resid;
+
+    vn_lock(vp);
+    if (use_file_offset)
+        uio->uio_offset = f_offset;
+
+    int error = VOP_READ(vp, this, uio, 0);
+    if (!error && use_file_offset) {
+        // Bytes transferred = what was requested minus what is left.
+        size_t done = requested - uio->uio_resid;
+        f_offset += done;
+    }
+    vn_unlock(vp);
+
+    return error;
+}
+
+
+// Write through the vnode's write operation while holding the vnode lock.
+// O_APPEND and O_DSYNC/O_SYNC in the file flags are translated to IO_APPEND
+// and IO_SYNC. Unless FOF_OFFSET is set in flags, the transfer starts at
+// the file's current offset, and that offset is advanced by the number of
+// bytes consumed when the write succeeds.
+// Returns the error code of VOP_WRITE.
+int vfs_file::write(struct uio *uio, int flags)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+    const bool use_file_offset = (flags & FOF_OFFSET) == 0;
+    const ssize_t requested = uio->uio_resid;
+    int ioflags = 0;
+
+    vn_lock(vp);
+
+    if (f_flags & O_APPEND)
+        ioflags |= IO_APPEND;
+    if (f_flags & (O_DSYNC|O_SYNC))
+        ioflags |= IO_SYNC;
+
+    if (use_file_offset)
+        uio->uio_offset = f_offset;
+
+    int error = VOP_WRITE(vp, uio, ioflags);
+    if (!error && use_file_offset) {
+        // Bytes transferred = what was requested minus what is left.
+        size_t done = requested - uio->uio_resid;
+        f_offset += done;
+    }
+
+    vn_unlock(vp);
+    return error;
+}
+
+// Forward an ioctl request to the vnode's ioctl operation under the vnode
+// lock and return its result.
+int vfs_file::ioctl(u_long com, void *data)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+
+    vn_lock(vp);
+    const int rc = VOP_IOCTL(vp, this, com, data);
+    vn_unlock(vp);
+
+    return rc;
+}
+
+// Fill *st through vn_stat() for the vnode behind this file, holding the
+// vnode lock for the duration of the call. Returns vn_stat()'s result.
+int vfs_file::stat(struct stat *st)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+
+    vn_lock(vp);
+    const int rc = vn_stat(vp, st);
+    vn_unlock(vp);
+
+    return rc;
+}
+
+// Polling is delegated to poll_no_poll(); no per-vnode poll handling is
+// done here.
+int vfs_file::poll(int events)
+{
+ return poll_no_poll(events);
+}
+
+// Not expected to be called: this entry point aborts the program
+// unconditionally and never returns a value.
+int vfs_file::truncate(off_t len)
+{
+ // somehow this is handled outside file ops
+ abort();
+}
+
+// Not expected to be called: this entry point aborts the program
+// unconditionally and never returns a value.
+int vfs_file::chmod(mode_t mode)
+{
+ // somehow this is handled outside file ops
+ abort();
+}
+
+// Map the page at offset `off` by delegating to pagecache::get(); returns
+// whatever pagecache::get() returns.
+bool vfs_file::map_page(uintptr_t off, mmu::hw_ptep<0> ptep, mmu::pt_element<0> pte, bool write, bool shared)
+{
+ return pagecache::get(this, off, ptep, pte, write, shared);
+}
+
+// Release a page previously obtained for this file by delegating to
+// pagecache::release(); returns whatever pagecache::release() returns.
+bool vfs_file::put_page(void *addr, uintptr_t off, mmu::hw_ptep<0> ptep)
+{
+ return pagecache::release(this, addr, off, ptep);
+}
+
+// Delegate syncing of the range [start, end) or [start, end] to
+// pagecache::sync() - NOTE(review): confirm whether `end` is inclusive.
+void vfs_file::sync(off_t start, off_t end)
+{
+ pagecache::sync(this, start, end);
+}
+
+// Locking: VOP_CACHE will call into the filesystem, and that can trigger an
+// eviction that will hold the mmu-side lock that protects the mappings
+// Always follow that order. We however can't just get rid of the mmu-side lock,
+// because not all invalidations will be synchronous.
+//
+// Issue a one-page (mmu::page_size) UIO_READ request at `offset` through
+// VOP_CACHE, passing `key` as the base of a single-element iovec.
+// Returns 0 when the request was fully consumed (uio_resid dropped to 0),
+// -1 otherwise.
+int vfs_file::get_arcbuf(void* key, off_t offset)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+
+    iovec io[1];
+
+    // NOTE(review): iov_len is left unset, as in the imported code;
+    // presumably VOP_CACHE implementations only look at iov_base - confirm.
+    io[0].iov_base = key;
+    uio data;
+    data.uio_iov = io;
+    data.uio_iovcnt = 1;
+    data.uio_offset = offset;
+    data.uio_resid = mmu::page_size;
+    data.uio_rw = UIO_READ;
+
+    vn_lock(vp);
+    // The call must live outside assert(): with NDEBUG defined the assert
+    // argument is not evaluated and the cache request would never be made.
+    int error = VOP_CACHE(vp, this, &data);
+    assert(error == 0);
+    (void)error; // silence "unused" when assert() compiles to nothing
+    vn_unlock(vp);
+
+    return (data.uio_resid != 0) ? -1 : 0;
+}
+
+// Pick the mapping strategy for this file: mmu::map_file_mmap() is used only
+// when the vnode provides a vop_cache operation and the file is at least one
+// page long; in every other case mmu::default_file_mmap() is used.
+std::unique_ptr<mmu::file_vma> vfs_file::mmap(addr_range range, unsigned flags,
+                                              unsigned perm, off_t offset)
+{
+    struct vnode *vp = f_dentry->d_vnode;
+    const bool cacheable = vp->v_op->vop_cache &&
+                           vp->v_size >= (off_t)mmu::page_size;
+
+    if (cacheable) {
+        return mmu::map_file_mmap(this, range, flags, perm, offset);
+    }
+    return mmu::default_file_mmap(this, range, flags, perm, offset);
+}
diff --git a/lib/vfscore/include/vfscore/dentry.h b/lib/vfscore/include/vfscore/dentry.h
new file mode 100644
index 00000000..a2545af8
--- /dev/null
+++ b/lib/vfscore/include/vfscore/dentry.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef _OSV_DENTRY_H
+#define _OSV_DENTRY_H 1
+
+#include <osv/mutex.h>
+#include <bsd/sys/sys/queue.h>
+
+struct vnode;
+
+/*
+ * Directory entry: ties a path on a mount to a vnode. A dentry is linked
+ * into a hash list (d_link), its vnode's name list (d_names_link) and its
+ * parent's children list (d_children_link).
+ */
+struct dentry {
+ LIST_ENTRY(dentry) d_link; /* link for hash list */
+ int d_refcnt; /* reference count */
+ char *d_path; /* pointer to path in fs */
+ struct vnode *d_vnode; /* vnode this entry names */
+ struct mount *d_mount; /* mount this entry belongs to */
+ struct dentry *d_parent; /* pointer to parent */
+ LIST_ENTRY(dentry) d_names_link; /* link for vnode::v_names */
+ mutex_t d_lock; /* taken around d_children updates */
+ LIST_HEAD(, dentry) d_children; /* child dentries */
+ LIST_ENTRY(dentry) d_children_link; /* link for parent's d_children */
+};
+
+#ifdef __cplusplus
+
+#include <boost/intrusive_ptr.hpp>
+
+/* Smart pointer that holds a dentry reference via dref()/drele(). */
+using dentry_ref = boost::intrusive_ptr<dentry>;
+
+extern "C" {
+ void dref(struct dentry* dp);
+ void drele(struct dentry* dp);
+};
+
+/* Hooks looked up by boost::intrusive_ptr to take/drop a reference. */
+inline void intrusive_ptr_add_ref(dentry* dp) { dref(dp); }
+inline void intrusive_ptr_release(dentry* dp) { drele(dp); }
+
+#endif
+
+#endif /* _OSV_DENTRY_H */
diff --git a/lib/vfscore/include/vfscore/mount.h b/lib/vfscore/include/vfscore/mount.h
new file mode 100644
index 00000000..7268d8ce
--- /dev/null
+++ b/lib/vfscore/include/vfscore/mount.h
@@ -0,0 +1,171 @@
+/*-
+ * Copyright (c) 1989, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)mount.h 8.21 (Berkeley) 5/20/95
+ */
+
+#ifndef _SYS_MOUNT_H_
+#define _SYS_MOUNT_H_
+
+#include <sys/cdefs.h>
+#include <sys/statfs.h>
+#include <osv/vnode.h>
+#include <bsd/sys/sys/queue.h>
+
+__BEGIN_DECLS
+
+#ifdef _KERNEL
+
+/*
+ * Mount data: one instance describes a mounted file system.
+ */
+struct mount {
+ struct vfsops *m_op; /* pointer to vfs operation */
+ int m_flags; /* mount flag */
+ int m_count; /* reference count */
+ char m_path[PATH_MAX]; /* mounted path */
+ char m_special[PATH_MAX]; /* resource */
+ struct device *m_dev; /* mounted device */
+ struct dentry *m_root; /* root dentry */
+ struct dentry *m_covered; /* dentry covered on parent fs */
+ void *m_data; /* private data for fs */
+ fsid_t m_fsid; /* id that uniquely identifies the fs */
+};
+
+#endif
+
+/*
+ * Mount flags.
+ */
+#define MNT_RDONLY 0x00000001 /* read only filesystem */
+#define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */
+#define MNT_NOEXEC 0x00000004 /* can't exec from filesystem */
+#define MNT_NOSUID 0x00000008 /* don't honor setuid bits on fs */
+#define MNT_NODEV 0x00000010 /* don't interpret special files */
+#define MNT_UNION 0x00000020 /* union with underlying filesystem */
+#define MNT_ASYNC 0x00000040 /* file system written asynchronously */
+
+/*
+ * Unmount flags.
+ */
+#define MNT_FORCE 0x00000001 /* forced unmount */
+
+/*
+ * exported mount flags.
+ */
+#define MNT_EXRDONLY 0x00000080 /* exported read only */
+#define MNT_EXPORTED 0x00000100 /* file system is exported */
+#define MNT_DEFEXPORTED 0x00000200 /* exported to the world */
+#define MNT_EXPORTANON 0x00000400 /* use anon uid mapping for everyone */
+#define MNT_EXKERB 0x00000800 /* exported with Kerberos uid mapping */
+
+/*
+ * Flags set by internal operations.
+ */
+#define MNT_LOCAL 0x00001000 /* filesystem is stored locally */
+#define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */
+#define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */
+
+/*
+ * Mask of flags that are visible to statfs()
+ */
+#define MNT_VISFLAGMASK 0x0000ffff
+
+#ifdef _KERNEL
+
+/*
+ * Filesystem type switch table.
+ * Each entry pairs a file system name with its init routine and its
+ * vfs operation table.
+ */
+struct vfssw {
+ const char *vs_name; /* name of file system */
+ int (*vs_init)(void); /* initialize routine */
+ struct vfsops *vs_op; /* pointer to vfs operation */
+};
+
+/*
+ * Operations supported on virtual file system.
+ * The entries are invoked through the VFS_* macros; vfs_vnops points to
+ * the vnode operation table.
+ */
+struct vfsops {
+ int (*vfs_mount) (struct mount *, const char *, int, const void *);
+ int (*vfs_unmount) (struct mount *, int flags);
+ int (*vfs_sync) (struct mount *);
+ int (*vfs_vget) (struct mount *, struct vnode *);
+ int (*vfs_statfs) (struct mount *, struct statfs *);
+ struct vnops *vfs_vnops;
+};
+
+typedef int (*vfsop_mount_t)(struct mount *, const char *, int, const void *);
+typedef int (*vfsop_umount_t)(struct mount *, int flags);
+typedef int (*vfsop_sync_t)(struct mount *);
+typedef int (*vfsop_vget_t)(struct mount *, struct vnode *);
+typedef int (*vfsop_statfs_t)(struct mount *, struct statfs *);
+
+/*
+ * VFS interface
+ * Each macro dispatches through the m_op table of the mount MP and passes
+ * MP itself as the first argument.
+ */
+#define VFS_MOUNT(MP, DEV, FL, DAT) ((MP)->m_op->vfs_mount)(MP, DEV, FL, DAT)
+#define VFS_UNMOUNT(MP, FL) ((MP)->m_op->vfs_unmount)(MP, FL)
+#define VFS_SYNC(MP) ((MP)->m_op->vfs_sync)(MP)
+#define VFS_VGET(MP, VP) ((MP)->m_op->vfs_vget)(MP, VP)
+#define VFS_STATFS(MP, SFP) ((MP)->m_op->vfs_statfs)(MP, SFP)
+
+/*
+ * NOTE(review): this header declares vfs_nullop() but no vfs_null;
+ * VFS_NULL may have been meant to reference vfs_nullop - confirm.
+ */
+#define VFS_NULL ((void *)vfs_null)
+
+int vfs_nullop(void);
+int vfs_einval(void);
+
+void vfs_busy(struct mount *mp);
+void vfs_unbusy(struct mount *mp);
+
+void release_mp_dentries(struct mount *mp);
+
+#endif
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+#include <vector>
+#include <string>
+
+namespace osv {
+
+/*
+ * String description of one mount.
+ * NOTE(review): special/path presumably mirror mount::m_special and
+ * mount::m_path - confirm against current_mounts().
+ */
+struct mount_desc {
+ std::string special;
+ std::string path;
+ std::string type;
+ std::string options;
+};
+
+/* Returns one mount_desc per entry; the implementation is not in this header. */
+std::vector<mount_desc> current_mounts();
+
+}
+
+#endif
+
+#endif /* !_SYS_MOUNT_H_ */
diff --git a/lib/vfscore/include/vfscore/prex.h b/lib/vfscore/include/vfscore/prex.h
new file mode 100644
index 00000000..43650340
--- /dev/null
+++ b/lib/vfscore/include/vfscore/prex.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef _OSV_PREX_H
+#define _OSV_PREX_H 1
+
+
+#include <unistd.h>
+#include <osv/fcntl.h>
+
+__BEGIN_DECLS
+
+#define __packed __attribute__((__packed__))
+
+#define BSIZE 512 /* size of secondary block (bytes) */
+
+#define DO_RDWR 0x2
+
+/* Page size is hard-coded to 4 KiB here. */
+#define PAGE_SIZE 4096
+#define PAGE_MASK (PAGE_SIZE-1)
+/* Round x up to the next multiple of PAGE_SIZE. */
+#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)
+
+size_t strlcat(char *dst, const char *src, size_t siz);
+size_t strlcpy(char *dst, const char *src, size_t siz);
+
+void sys_panic(const char *);
+
+__END_DECLS
+
+#endif /* _OSV_PREX_H */
diff --git a/lib/vfscore/include/vfscore/uio.h b/lib/vfscore/include/vfscore/uio.h
new file mode 100644
index 00000000..696b01cf
--- /dev/null
+++ b/lib/vfscore/include/vfscore/uio.h
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uio.h 8.5 (Berkeley) 2/22/94
+ * $FreeBSD$
+ */
+
+#ifndef _UIO_H_
+#define _UIO_H_
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <limits.h>
+
+__BEGIN_DECLS
+
+enum uio_rw { UIO_READ, UIO_WRITE };
+
+/*
+ * Safe default to prevent possible overflows in user code, otherwise could
+ * be SSIZE_T_MAX.
+ */
+#define IOSIZE_MAX INT_MAX
+
+#define UIO_MAXIOV 1024
+
+#define UIO_SYSSPACE 0
+
+/*
+ * Describes one scatter/gather I/O request: the iovec array, the position
+ * in the target object and the number of bytes still to be processed.
+ */
+struct uio {
+ struct iovec *uio_iov; /* scatter/gather list */
+ int uio_iovcnt; /* length of scatter/gather list */
+ off_t uio_offset; /* offset in target object */
+ ssize_t uio_resid; /* remaining bytes to process */
+ enum uio_rw uio_rw; /* operation */
+};
+
+int uiomove(void *cp, int n, struct uio *uio);
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+/*
+ * Walk the iovec array of uio in order and hand each non-empty segment to
+ * f(const char *data, size_t len). After each segment the uio is advanced:
+ * uio_iov/uio_iovcnt move to the next element, and uio_resid/uio_offset are
+ * adjusted by the segment length. The walk stops once uio_resid is no
+ * longer positive. ioflag is accepted but not used.
+ */
+template <typename F>
+static inline void linearize_uio_write(struct uio *uio, int ioflag, F f)
+{
+    for (; uio->uio_resid > 0; uio->uio_iov++, uio->uio_iovcnt--) {
+        struct iovec *seg = uio->uio_iov;
+
+        if (seg->iov_len != 0) {
+            f(reinterpret_cast<const char *>(seg->iov_base),
+              seg->iov_len);
+        }
+
+        uio->uio_resid -= seg->iov_len;
+        uio->uio_offset += seg->iov_len;
+    }
+}
+
+#endif
+
+#endif /* !_UIO_H_ */
diff --git a/lib/vfscore/include/vfscore/vnode.h b/lib/vfscore/include/vfscore/vnode.h
new file mode 100644
index 00000000..e35aa830
--- /dev/null
+++ b/lib/vfscore/include/vfscore/vnode.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _SYS_VNODE_H_
+#define _SYS_VNODE_H_
+
+#ifdef _KERNEL
+
+#include <sys/cdefs.h>
+#include <sys/stat.h>
+#include <osv/prex.h>
+#include <osv/uio.h>
+#include <osv/mutex.h>
+#include "file.h"
+#include "dirent.h"
+
+__BEGIN_DECLS
+
+struct vfsops;
+struct vnops;
+struct vnode;
+struct file;
+
+/*
+ * Vnode types.
+ */
+enum vtype {
+ VNON, /* no type */
+ VREG, /* regular file */
+ VDIR, /* directory */
+ VBLK, /* block device */
+ VCHR, /* character device */
+ VLNK, /* symbolic link */
+ VSOCK, /* socket */
+ VFIFO, /* FIFO */
+ VBAD
+};
+
+/*
+ * In-memory representation of a file system object.
+ *
+ * Reading or writing any of these items requires holding the
+ * appropriate lock.
+ */
+struct vnode {
+ uint64_t v_ino; /* inode number */
+ LIST_ENTRY(vnode) v_link; /* link for hash list */
+ struct mount *v_mount; /* mounted vfs pointer */
+ struct vnops *v_op; /* vnode operations */
+ int v_refcnt; /* reference count */
+ int v_type; /* vnode type (see enum vtype) */
+ int v_flags; /* vnode flag (VROOT, VISTTY, VPROTDEV) */
+ mode_t v_mode; /* file mode */
+ off_t v_size; /* file size */
+ mutex_t v_lock; /* lock for this vnode */
+ LIST_HEAD(, dentry) v_names; /* directory entries pointing at this */
+ int v_nrlocks; /* lock count (for debug) */
+ void *v_data; /* private data for fs */
+};
+
+/* flags for vnode */
+#define VROOT 0x0001 /* root of its file system */
+#define VISTTY 0x0002 /* device is tty */
+#define VPROTDEV 0x0004 /* protected device */
+
+/*
+ * Vnode attribute
+ * Exchanged with a file system through VOP_GETATTR / VOP_SETATTR.
+ */
+struct vattr {
+ unsigned int va_mask; /* NOTE(review): presumably selects which fields are valid - confirm */
+ enum vtype va_type; /* vnode type */
+ mode_t va_mode; /* file access mode */
+ nlink_t va_nlink;
+ uid_t va_uid;
+ gid_t va_gid;
+ dev_t va_fsid; /* id of the underlying filesystem */
+ ino_t va_nodeid;
+ struct timespec va_atime;
+ struct timespec va_mtime;
+ struct timespec va_ctime;
+ dev_t va_rdev;
+ uint64_t va_nblocks;
+ off_t va_size;
+};
+
+/*
+ * Modes.
+ */
+#define VAPPEND 00010
+#define VREAD 00004 /* read, write, execute permissions */
+#define VWRITE 00002
+#define VEXEC 00001
+
+#define IO_APPEND 0x0001
+#define IO_SYNC 0x0002
+
+/*
+ * ARC actions
+ */
+#define ARC_ACTION_QUERY 0
+#define ARC_ACTION_HOLD 1
+#define ARC_ACTION_RELEASE 2
+
+/* Function pointer types for the entries of struct vnops. */
+typedef int (*vnop_open_t) (struct file *);
+typedef int (*vnop_close_t) (struct vnode *, struct file *);
+typedef int (*vnop_read_t) (struct vnode *, struct file *, struct uio *, int);
+typedef int (*vnop_write_t) (struct vnode *, struct uio *, int);
+typedef int (*vnop_seek_t) (struct vnode *, struct file *, off_t, off_t);
+typedef int (*vnop_ioctl_t) (struct vnode *, struct file *, u_long, void *);
+typedef int (*vnop_fsync_t) (struct vnode *, struct file *);
+typedef int (*vnop_readdir_t) (struct vnode *, struct file *, struct dirent *);
+typedef int (*vnop_lookup_t) (struct vnode *, char *, struct vnode **);
+typedef int (*vnop_create_t) (struct vnode *, char *, mode_t);
+typedef int (*vnop_remove_t) (struct vnode *, struct vnode *, char *);
+typedef int (*vnop_rename_t) (struct vnode *, struct vnode *, char *,
+ struct vnode *, struct vnode *, char *);
+typedef int (*vnop_mkdir_t) (struct vnode *, char *, mode_t);
+typedef int (*vnop_rmdir_t) (struct vnode *, struct vnode *, char *);
+typedef int (*vnop_getattr_t) (struct vnode *, struct vattr *);
+typedef int (*vnop_setattr_t) (struct vnode *, struct vattr *);
+typedef int (*vnop_inactive_t) (struct vnode *);
+typedef int (*vnop_truncate_t) (struct vnode *, off_t);
+typedef int (*vnop_link_t) (struct vnode *, struct vnode *, char *);
+typedef int (*vnop_cache_t) (struct vnode *, struct file *, struct uio *);
+typedef int (*vnop_fallocate_t) (struct vnode *, int, loff_t, loff_t);
+typedef int (*vnop_readlink_t) (struct vnode *, struct uio *);
+typedef int (*vnop_symlink_t) (struct vnode *, char *, char *);
+
+/*
+ * vnode operations
+ * Per-filesystem table of callbacks; entries are invoked through the
+ * VOP_* macros.
+ */
+struct vnops {
+ vnop_open_t vop_open;
+ vnop_close_t vop_close;
+ vnop_read_t vop_read;
+ vnop_write_t vop_write;
+ vnop_seek_t vop_seek;
+ vnop_ioctl_t vop_ioctl;
+ vnop_fsync_t vop_fsync;
+ vnop_readdir_t vop_readdir;
+ vnop_lookup_t vop_lookup;
+ vnop_create_t vop_create;
+ vnop_remove_t vop_remove;
+ vnop_rename_t vop_rename;
+ vnop_mkdir_t vop_mkdir;
+ vnop_rmdir_t vop_rmdir;
+ vnop_getattr_t vop_getattr;
+ vnop_setattr_t vop_setattr;
+ vnop_inactive_t vop_inactive;
+ vnop_truncate_t vop_truncate;
+ vnop_link_t vop_link;
+ vnop_cache_t vop_cache;
+ vnop_fallocate_t vop_fallocate;
+ vnop_readlink_t vop_readlink;
+ vnop_symlink_t vop_symlink;
+};
+
+/*
+ * vnode interface
+ * Each macro dispatches through the v_op table of its first argument.
+ * Note that VOP_OPEN passes only the file, not the vnode, to vop_open.
+ */
+#define VOP_OPEN(VP, FP) ((VP)->v_op->vop_open)(FP)
+#define VOP_CLOSE(VP, FP) ((VP)->v_op->vop_close)(VP, FP)
+#define VOP_READ(VP, FP, U, F) ((VP)->v_op->vop_read)(VP, FP, U, F)
+#define VOP_CACHE(VP, FP, U) ((VP)->v_op->vop_cache)(VP, FP, U)
+#define VOP_WRITE(VP, U, F) ((VP)->v_op->vop_write)(VP, U, F)
+#define VOP_SEEK(VP, FP, OLD, NEW) ((VP)->v_op->vop_seek)(VP, FP, OLD, NEW)
+#define VOP_IOCTL(VP, FP, C, A) ((VP)->v_op->vop_ioctl)(VP, FP, C, A)
+#define VOP_FSYNC(VP, FP) ((VP)->v_op->vop_fsync)(VP, FP)
+#define VOP_READDIR(VP, FP, DIR) ((VP)->v_op->vop_readdir)(VP, FP, DIR)
+#define VOP_LOOKUP(DVP, N, VP) ((DVP)->v_op->vop_lookup)(DVP, N, VP)
+#define VOP_CREATE(DVP, N, M) ((DVP)->v_op->vop_create)(DVP, N, M)
+#define VOP_REMOVE(DVP, VP, N) ((DVP)->v_op->vop_remove)(DVP, VP, N)
+#define VOP_RENAME(DVP1, VP1, N1, DVP2, VP2, N2) \
+ ((DVP1)->v_op->vop_rename)(DVP1, VP1, N1, DVP2, VP2, N2)
+#define VOP_MKDIR(DVP, N, M) ((DVP)->v_op->vop_mkdir)(DVP, N, M)
+#define VOP_RMDIR(DVP, VP, N) ((DVP)->v_op->vop_rmdir)(DVP, VP, N)
+#define VOP_GETATTR(VP, VAP) ((VP)->v_op->vop_getattr)(VP, VAP)
+#define VOP_SETATTR(VP, VAP) ((VP)->v_op->vop_setattr)(VP, VAP)
+#define VOP_INACTIVE(VP) ((VP)->v_op->vop_inactive)(VP)
+#define VOP_TRUNCATE(VP, N) ((VP)->v_op->vop_truncate)(VP, N)
+#define VOP_LINK(DVP, SVP, N) ((DVP)->v_op->vop_link)(DVP, SVP, N)
+#define VOP_FALLOCATE(VP, M, OFF, LEN) ((VP)->v_op->vop_fallocate)(VP, M, OFF, LEN)
+#define VOP_READLINK(VP, U) ((VP)->v_op->vop_readlink)(VP, U)
+#define VOP_SYMLINK(DVP, OP, NP) ((DVP)->v_op->vop_symlink)(DVP, OP, NP)
+
+int vop_nullop(void);
+int vop_einval(void);
+int vop_eperm(void);
+int vop_erofs(void);
+struct vnode *vn_lookup(struct mount *, uint64_t);
+void vn_lock(struct vnode *);
+void vn_unlock(struct vnode *);
+int vn_stat(struct vnode *, struct stat *);
+int vn_settimes(struct vnode *, struct timespec[2]);
+int vn_setmode(struct vnode *, mode_t mode);
+int vn_access(struct vnode *, int);
+int vget(struct mount *, uint64_t ino, struct vnode **vpp);
+void vput(struct vnode *);
+void vref(struct vnode *);
+void vrele(struct vnode *);
+void vflush(struct mount *);
+void vn_add_name(struct vnode *, struct dentry *);
+void vn_del_name(struct vnode *, struct dentry *);
+
+/* Conversion tables between S_IFMT file type bits and enum vtype. */
+extern enum vtype iftovt_tab[];
+extern int vttoif_tab[];
+/* Index iftovt_tab by the file type bits of mode (S_IFMT, shifted by 12). */
+#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
+#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
+#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
+
+/*
+ * Reset *vp to an empty value.
+ * NOTE(review): vattr_t is not defined in this header (only struct vattr
+ * is) - confirm a typedef exists elsewhere.
+ */
+#define VATTR_NULL(vp) (*(vp) = (vattr_t){})
+
+/* Record size as the vnode's file size; only v_size is updated. */
+static inline void vnode_pager_setsize(struct vnode *vp, off_t size)
+{
+ vp->v_size = size;
+}
+
+__END_DECLS
+
+#endif
+
+#endif /* !_SYS_VNODE_H_ */
diff --git a/lib/vfscore/lookup.c b/lib/vfscore/lookup.c
new file mode 100644
index 00000000..ad03fe25
--- /dev/null
+++ b/lib/vfscore/lookup.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/param.h>
+
+#include <osv/dentry.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+/*
+ * Read the target of the symbolic link vnode vp into buf.
+ *
+ * Up to bufsz bytes are requested through VOP_READLINK() while the vnode
+ * is locked. On success 0 is returned and *sz is set to bufsz minus the
+ * residual count left in the uio; on failure the VOP error code is
+ * returned and *sz stays 0. This function itself does not NUL-terminate
+ * buf.
+ */
+static ssize_t
+read_link(struct vnode *vp, char *buf, size_t bufsz, ssize_t *sz)
+{
+    struct iovec vec = {buf, bufsz};
+    struct uio io = {&vec, 1, 0, (ssize_t) bufsz, UIO_READ};
+
+    *sz = 0;
+
+    vn_lock(vp);
+    int err = VOP_READLINK(vp, &io);
+    vn_unlock(vp);
+
+    if (err == 0) {
+        *sz = bufsz - io.uio_resid;
+    }
+    return (err);
+}
+
+/*
+ * Rewrite the path being resolved after a symbolic link was encountered.
+ *
+ * @dp:             dentry of the symbolic link.
+ * @node:           path of the link inside its file system; cleared on return.
+ * @name:           last component of node (the link's name); cleared on return.
+ * @fp:             PATH_MAX-sized buffer with the full path being resolved;
+ *                  overwritten with the path to resolve next.
+ * @mountpoint_len: length of the mount point prefix of fp.
+ *
+ * Returns 0 on success or the error reported while reading the link.
+ */
+int
+namei_follow_link(struct dentry *dp, char *node, char *name, char *fp,
+                  size_t mountpoint_len)
+{
+    std::unique_ptr<char []> link (new char[PATH_MAX]);
+    std::unique_ptr<char []> t (new char[PATH_MAX]);
+    char *lp;
+    int error;
+    ssize_t sz;
+    char *p;
+    int c;
+
+    lp = link.get();
+    /*
+     * Ask for at most PATH_MAX - 1 bytes: the terminator written below
+     * must still fit inside the PATH_MAX-sized buffer.
+     */
+    error = read_link(dp->d_vnode, lp, PATH_MAX - 1, &sz);
+    if (error != 0) {
+        return (error);
+    }
+    lp[sz] = 0;
+
+    /* p: the part of the original path that follows the link component. */
+    p = fp + mountpoint_len + strlen(node);
+    /* Cut node right before "/<name>", leaving the link's parent directory. */
+    c = strlen(node) - strlen(name) - 1;
+    node[c] = 0;
+
+    if (lp[0] == '/') {
+        /* Absolute target: new path is <target><rest of path>. */
+        strlcat(lp, p, PATH_MAX);
+        strlcpy(fp, lp, PATH_MAX);
+    } else {
+        /*
+         * Relative target: save the rest of the path, rebuild the parent
+         * directory (mount point + parent) in node, let path_conv() combine
+         * it with the target into fp, then re-append the rest.
+         */
+        strlcpy(t.get(), p, PATH_MAX);
+        strlcpy(node, fp, mountpoint_len + c + 1);
+        path_conv(node, lp, fp);
+        strlcat(fp, t.get(), PATH_MAX);
+    }
+    node[0] = 0;
+    name[0] = 0;
+    return (0);
+}
+/*
+ * Convert a pathname into a pointer to a dentry
+ *
+ * @path: full path name.
+ * @dpp: dentry to be returned.
+ *
+ * The path is copied into a private buffer and walked component by
+ * component from the root of the file system found by vfs_findroot().
+ * When a component is a symbolic link the buffer is rewritten by
+ * namei_follow_link() and the walk restarts; at most MAXSYMLINKS links
+ * are followed before ELOOP is returned.
+ *
+ * Returns 0 on success, otherwise an errno value (ENOTDIR, ENOMEM, ELOOP
+ * or whatever VOP_LOOKUP()/namei_follow_link() reported).
+ */
+int
+namei(const char *path, struct dentry **dpp)
+{
+ char *p;
+ char node[PATH_MAX];
+ char name[PATH_MAX];
+ std::unique_ptr<char []> fp (new char [PATH_MAX]);
+ std::unique_ptr<char []> t (new char [PATH_MAX]);
+ struct mount *mp;
+ struct dentry *dp, *ddp;
+ struct vnode *dvp, *vp;
+ int error, i;
+ int links_followed;
+ bool need_continue;
+
+ DPRINTF(VFSDB_VNODE, ("namei: path=%s\n", path));
+
+ links_followed = 0;
+ strlcpy(fp.get(), path, PATH_MAX);
+
+ /* Each iteration resolves fp from scratch; repeated after a symlink. */
+ do {
+ need_continue = false;
+ /*
+ * Convert a full path name to its mount point and
+ * the local node in the file system.
+ */
+ if (vfs_findroot(fp.get(), &mp, &p)) {
+ return ENOTDIR;
+ }
+ int mountpoint_len = p - fp.get() - 1;
+ strlcpy(node, "/", sizeof(node));
+ strlcat(node, p, sizeof(node));
+ /* Fast path: the whole node path already has a dentry. */
+ dp = dentry_lookup(mp, node);
+ if (dp) {
+ /* vnode is already active. */
+ *dpp = dp;
+ return 0;
+ }
+ /*
+ * Find target vnode, started from root directory.
+ * This is done to attach the fs specific data to
+ * the target vnode.
+ */
+ ddp = mp->m_root;
+ if (!ddp) {
+ sys_panic("VFS: no root");
+ }
+ dref(ddp);
+
+ /* node is rebuilt incrementally as components are consumed. */
+ node[0] = '\0';
+
+ while (*p != '\0') {
+ /*
+ * Get lower directory/file name.
+ */
+ while (*p == '/') {
+ p++;
+ }
+
+ if (*p == '\0') {
+ break;
+ }
+
+ for (i = 0; i < PATH_MAX; i++) {
+ if (*p == '\0' || *p == '/') {
+ break;
+ }
+ name[i] = *p++;
+ }
+ name[i] = '\0';
+
+ /*
+ * Get a vnode for the target.
+ */
+ strlcat(node, "/", sizeof(node));
+ strlcat(node, name, sizeof(node));
+ dvp = ddp->d_vnode;
+ vn_lock(dvp);
+ dp = dentry_lookup(mp, node);
+ if (dp == nullptr) {
+ /* Find a vnode in this directory. */
+ error = VOP_LOOKUP(dvp, name, &vp);
+ if (error) {
+ vn_unlock(dvp);
+ drele(ddp);
+ return error;
+ }
+
+ dp = dentry_alloc(ddp, vp, node);
+ vput(vp);
+
+ if (!dp) {
+ vn_unlock(dvp);
+ drele(ddp);
+ return ENOMEM;
+ }
+ }
+ vn_unlock(dvp);
+ /* Done with the parent; the child becomes the new parent. */
+ drele(ddp);
+ ddp = dp;
+
+ if (dp->d_vnode->v_type == VLNK) {
+ error = namei_follow_link(dp, node, name, fp.get(),
mountpoint_len);
+ if (error) {
+ drele(dp);
+ return (error);
+ }
+
+ drele(dp);
+
+ /* fp now holds the rewritten path; restart the outer loop. */
+ p = fp.get();
+ dp = nullptr;
+ ddp = nullptr;
+ vp = nullptr;
+ dvp = nullptr;
+ name[0] = 0;
+ node[0] = 0;
+
+ if (++links_followed >= MAXSYMLINKS) {
+ return (ELOOP);
+ }
+ need_continue = true;
+ break;
+ }
+
+ /* More components follow, so this one must be a directory. */
+ if (*p == '/' && ddp->d_vnode->v_type != VDIR) {
+ drele(ddp);
+ return ENOTDIR;
+ }
+ }
+ } while (need_continue == true);
+
+ *dpp = dp;
+ return 0;
+}
+
+/*
+ * Convert last component in the path to pointer to dentry
+ *
+ * @path: full path name
+ * @ddp : pointer to dentry of parent
+ * @dpp : dentry to be returned
+ *
+ * Unlike namei(), this routine never inspects the vnode type, so a
+ * symbolic link in the last component is returned as-is.
+ * Returns 0 on success; ENOTDIR if the path is not absolute or has no
+ * mount point, ENOENT, ENOMEM, or the error from VOP_LOOKUP().
+ */
+int
+namei_last_nofollow(char *path, struct dentry *ddp, struct dentry **dpp)
+{
+ char *name;
+ int error;
+ struct mount *mp;
+ char *p;
+ struct dentry *dp;
+ struct vnode *dvp;
+ struct vnode *vp;
+ std::unique_ptr<char []> node (new char[PATH_MAX]);
+
+ dvp = nullptr;
+
+ if (path[0] != '/') {
+ return (ENOTDIR);
+ }
+
+ /* name points just past the last '/' in path. */
+ name = strrchr(path, '/');
+ if (name == nullptr) {
+ return (ENOENT);
+ }
+ name++;
+
+ error = vfs_findroot(path, &mp, &p);
+ if (error != 0) {
+ return (ENOTDIR);
+ }
+
+ strlcpy(node.get(), "/", PATH_MAX);
+ strlcat(node.get(), p, PATH_MAX);
+
+ // We want to treat things like /tmp/ the same as /tmp. Best way to do that
+ // is to ignore the last character, except when we're stating the root.
+ auto l = strlen(node.get()) - 1;
+ if (l && node.get()[l] == '/') {
+ node.get()[l] = '\0';
+ }
+
+ /* The parent vnode stays locked across the lookup/allocation below. */
+ dvp = ddp->d_vnode;
+ vn_lock(dvp);
+ dp = dentry_lookup(mp, node.get());
+ if (dp == nullptr) {
+ error = VOP_LOOKUP(dvp, name, &vp);
+ if (error != 0) {
+ goto out;
+ }
+
+ dp = dentry_alloc(ddp, vp, node.get());
+ vput(vp);
+
+ if (dp == nullptr) {
+ error = ENOMEM;
+ goto out;
+ }
+ }
+
+ *dpp = dp;
+ error = 0;
+out:
+ /* Common exit: release the parent vnode lock taken above. */
+ if (dvp != nullptr) {
+ vn_unlock(dvp);
+ }
+ return (error);
+}
+
+/*
+ * Search a pathname.
+ * This is a very central but not so complicated routine. ;-P
+ *
+ * @path: full path.
+ * @dpp: pointer to dentry for directory.
+ * @name: if non-null, pointer to file name in path.
+ *
+ * The path is split at its last '/': the part before it is resolved with
+ * namei() and must be a directory, the part after it is returned through
+ * @name (pointing into @path, not a copy). On success the directory dentry
+ * obtained from namei() is handed to the caller through @dpp; this routine
+ * does not lock its vnode.
+ *
+ * Returns 0 on success, ENOTDIR if the path is empty, contains no '/' or
+ * its parent is not a directory, or the error reported by namei().
+ */
+int
+lookup(char *path, struct dentry **dpp, char **name)
+{
+    char buf[PATH_MAX];
+    char root[] = "/";
+    char *file, *dir;
+    struct dentry *dp;
+    int error;
+
+    DPRINTF(VFSDB_VNODE, ("lookup: path=%s\n", path));
+
+    /*
+     * Get the path for directory.
+     */
+    strlcpy(buf, path, sizeof(buf));
+    file = strrchr(buf, '/');
+    /*
+     * An empty path or one without any '/' has no directory part;
+     * bail out instead of dereferencing a null strrchr() result.
+     */
+    if (!buf[0] || file == nullptr) {
+        return ENOTDIR;
+    }
+    if (file == buf) {
+        /* The only '/' is the leading one: the parent is the root. */
+        dir = root;
+    } else {
+        *file = '\0';
+        dir = buf;
+    }
+    /*
+     * Get the vnode for directory
+     */
+    if ((error = namei(dir, &dp)) != 0) {
+        return error;
+    }
+    if (dp->d_vnode->v_type != VDIR) {
+        drele(dp);
+        return ENOTDIR;
+    }
+
+    *dpp = dp;
+
+    if (name) {
+        /*
+         * Get the file name
+         */
+        *name = strrchr(path, '/') + 1;
+    }
+    return 0;
+}
+
+/*
+ * lookup_init() prepares the lookup layer by initializing the dentry
+ * subsystem via dentry_init(). Per the original note it is called once
+ * (from vfs_init) in initialization.
+ */
+void
+lookup_init(void)
+{
+ dentry_init();
+}
diff --git a/lib/vfscore/main.c b/lib/vfscore/main.c
new file mode 100644
index 00000000..cd141117
--- /dev/null
+++ b/lib/vfscore/main.c
@@ -0,0 +1,2413 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/sendfile.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <signal.h>
+#define open __open_variadic
+#define fcntl __fcntl_variadic
+#include <fcntl.h>
+#undef open
+#undef fcntl
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/stubbing.hh>
+#include <osv/ioctl.h>
+#include <osv/trace.hh>
+#include <osv/run.hh>
+#include <drivers/console.hh>
+
+#include "vfs.h"
+
+#include "libc/internal/libc.h"
+
+#include <algorithm>
+#include <unordered_map>
+
+#include <sys/file.h>
+
+#include "fs/fs.hh"
+#include "libc/libc.hh"
+
+#include <mntent.h>
+#include <sys/mman.h>
+
+#include <osv/clock.hh>
+#include <api/utime.h>
+#include <chrono>
+
+using namespace std;
+
+
+#ifdef DEBUG_VFS
+int vfs_debug = VFSDB_FLAGS;
+#endif
+
+std::atomic<mode_t> global_umask{S_IWGRP | S_IWOTH};
+
+/* Return mode with every bit that is set in global_umask cleared. */
+static inline mode_t apply_umask(mode_t mode)
+{
+ return mode & ~global_umask.load(std::memory_order_relaxed);
+}
+
+TRACEPOINT(trace_vfs_open, "\"%s\" 0x%x 0%0o", const char*, int, mode_t);
+TRACEPOINT(trace_vfs_open_ret, "%d", int);
+TRACEPOINT(trace_vfs_open_err, "%d", int);
+
+struct task *main_task; /* we only have a single process */
+
+/*
+ * open(2): open pathname and return a new file descriptor.
+ *
+ * The optional third argument (mode) is read only when O_CREAT is set and
+ * is filtered through the umask. Returns the descriptor on success; on
+ * failure returns -1 with errno set to the error code.
+ */
+extern "C"
+int open(const char *pathname, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list args;
+        va_start(args, flags);
+        mode = apply_umask(va_arg(args, mode_t));
+        va_end(args);
+    }
+
+    trace_vfs_open(pathname, flags, mode);
+
+    /* Shared failure epilogue: publish errno, trace, report -1. */
+    auto fail = [](int err) {
+        errno = err;
+        trace_vfs_open_err(err);
+        return -1;
+    };
+
+    /* Translate the access mode bits into the VREAD/VWRITE mask. */
+    int acc = 0;
+    const int accmode = flags & O_ACCMODE;
+    if (accmode == O_RDONLY) {
+        acc = VREAD;
+    } else if (accmode == O_WRONLY) {
+        acc = VWRITE;
+    } else if (accmode == O_RDWR) {
+        acc = VREAD | VWRITE;
+    }
+
+    char path[PATH_MAX];
+    int error = task_conv(main_task, pathname, acc, path);
+    if (error) {
+        return fail(error);
+    }
+
+    struct file *fp;
+    error = sys_open(path, flags, mode, &fp);
+    if (error) {
+        return fail(error);
+    }
+
+    int fd;
+    error = fdalloc(fp, &fd);
+    /* The reference from sys_open() is dropped whether or not fdalloc worked. */
+    fdrop(fp);
+    if (error) {
+        return fail(error);
+    }
+
+    trace_vfs_open_ret(fd);
+    return fd;
+}
+
+LFS64(open);
+
+/*
+ * openat(2): like open(), but a relative pathname is resolved against the
+ * directory referred to by dirfd (unless dirfd is AT_FDCWD).
+ *
+ * The absolute path is built from the mount path and dentry path of dirfd
+ * and handed to open(); the directory vnode is kept locked meanwhile.
+ * Returns what open() returns, or -1 with errno set if dirfd is invalid.
+ */
+int openat(int dirfd, const char *pathname, int flags, ...)
+{
+    mode_t mode = 0;
+    if (flags & O_CREAT) {
+        va_list args;
+        va_start(args, flags);
+        mode = apply_umask(va_arg(args, mode_t));
+        va_end(args);
+    }
+
+    const bool plain_open = (pathname[0] == '/') || (dirfd == AT_FDCWD);
+    if (plain_open) {
+        return open(pathname, flags, mode);
+    }
+
+    struct file *dirfp;
+    int rc = fget(dirfd, &dirfp);
+    if (rc) {
+        errno = rc;
+        return -1;
+    }
+
+    struct vnode *dirvp = dirfp->f_dentry->d_vnode;
+    vn_lock(dirvp);
+
+    std::unique_ptr<char []> storage (new char[PATH_MAX]);
+    char *abspath = storage.get();
+
+    /* <mount path><directory path>/<pathname> */
+    strlcpy(abspath, dirfp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(abspath, dirfp->f_dentry->d_path, PATH_MAX);
+    strlcat(abspath, "/", PATH_MAX);
+    strlcat(abspath, pathname, PATH_MAX);
+
+    rc = open(abspath, flags, mode);
+
+    vn_unlock(dirvp);
+    fdrop(dirfp);
+
+    return rc;
+}
+LFS64(openat);
+
+// open() has an optional third argument, "mode", which is only needed in
+// some cases (when the O_CREAT flag is used). As a safety feature, recent
+// versions of Glibc add a feature where open() with two arguments is replaced
+// by a call to __open_2(), which verifies it isn't called with O_CREAT.
+// Here the verification is an assert(); the call is then forwarded to open()
+// with a mode of 0.
+extern "C" int __open_2(const char *pathname, int flags)
+{
+ assert(!(flags & O_CREAT));
+ return open(pathname, flags, 0);
+}
+
+// Two-argument variant of open64(): since no mode can be supplied, a call
+// with O_CREAT is rejected with EINVAL (return -1); otherwise the call is
+// forwarded to open64().
+extern "C" int __open64_2(const char *file, int flags)
+{
+ if (flags & O_CREAT) {
+ errno = EINVAL;
+ return -1;
+ }
+
+ return open64(file, flags);
+}
+
+// creat(): shorthand for open() with O_CREAT|O_WRONLY|O_TRUNC and the given
+// mode; returns whatever open() returns.
+int creat(const char *pathname, mode_t mode)
+{
+ return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode);
+}
+LFS64(creat);
+
+TRACEPOINT(trace_vfs_close, "%d", int);
+TRACEPOINT(trace_vfs_close_ret, "");
+TRACEPOINT(trace_vfs_close_err, "%d", int);
+
+/*
+ * close(2): release file descriptor fd through fdclose().
+ * Returns 0 on success, or -1 with errno set to the fdclose() error.
+ */
+int close(int fd)
+{
+    trace_vfs_close(fd);
+
+    const int err = fdclose(fd);
+    if (err) {
+        trace_vfs_close_err(err);
+        errno = err;
+        return -1;
+    }
+
+    trace_vfs_close_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_mknod, "\"%s\" 0%0o 0x%x", const char*, mode_t, dev_t);
+TRACEPOINT(trace_vfs_mknod_ret, "");
+TRACEPOINT(trace_vfs_mknod_err, "%d", int);
+
+
+/*
+ * Versioned mknod entry point. Only version 0 is accepted (asserted).
+ * The device number is only used for tracing; the node is created with
+ * sys_mknod(path, mode). Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __xmknod(int ver, const char *pathname, mode_t mode, dev_t *dev)
+{
+    assert(ver == 0); // On x86-64 Linux, _MKNOD_VER_LINUX is 0.
+
+    auto fail = [](int err) {
+        trace_vfs_mknod_err(err);
+        errno = err;
+        return -1;
+    };
+
+    char path[PATH_MAX];
+
+    trace_vfs_mknod(pathname, mode, *dev);
+
+    int err = task_conv(main_task, pathname, VWRITE, path);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = sys_mknod(path, mode);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_mknod_ret();
+    return 0;
+}
+
+// mknod(): forwards to __xmknod() with version 0 and a pointer to dev.
+int mknod(const char *pathname, mode_t mode, dev_t dev)
+{
+ return __xmknod(0, pathname, mode, &dev);
+}
+
+
+TRACEPOINT(trace_vfs_lseek, "%d 0x%x %d", int, off_t, int);
+TRACEPOINT(trace_vfs_lseek_ret, "0x%x", off_t);
+TRACEPOINT(trace_vfs_lseek_err, "%d", int);
+
+/*
+ * lseek(2): reposition the offset of fd via sys_lseek().
+ * Returns the offset reported by sys_lseek() on success, or -1 with errno
+ * set on failure.
+ */
+off_t lseek(int fd, off_t offset, int whence)
+{
+    auto fail = [](int err) -> off_t {
+        trace_vfs_lseek_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_lseek(fd, offset, whence);
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    off_t pos;
+    err = sys_lseek(fp, offset, whence, &pos);
+    fdrop(fp);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_lseek_ret(pos);
+    return pos;
+}
+
+LFS64(lseek);
+
+TRACEPOINT(trace_vfs_pread, "%d %p 0x%x 0x%x", int, void*, size_t, off_t);
+TRACEPOINT(trace_vfs_pread_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pread_err, "%d", int);
+
+// In BSD's internal implementation of read() and write() code, for example
+// sosend_generic(), a partial read or write returns both an EWOULDBLOCK error
+// *and* a non-zero number of written bytes. In that case, we need to zero the
+// error, so the system call appears as a successful partial read/write.
+// In FreeBSD, dofilewrite() and dofileread() (sys_generic.c) do this too.
+//
+// Returns true when the operation must be reported as failed: there is an
+// error and either nothing was transferred, or the error is not one of the
+// "interrupted / would block" codes (EWOULDBLOCK, EINTR, ERESTART).
+//
+// bytes is a size_t, matching the byte counters the callers pass in, so a
+// large count is never narrowed (and possibly turned into 0) on the way in.
+static inline bool has_error(int error, size_t bytes)
+{
+    return error && (
+        (bytes == 0) ||
+        (error != EWOULDBLOCK && error != EINTR && error != ERESTART));
+}
+
+
+/*
+ * pread(2): read up to count bytes from fd into buf using sys_read() with
+ * the given offset. Returns the number of bytes read, or -1 with errno set
+ * when has_error() classifies the result as a failure.
+ */
+ssize_t pread(int fd, void *buf, size_t count, off_t offset)
+{
+    trace_vfs_pread(fd, buf, count, offset);
+
+    auto fail = [](int err) -> ssize_t {
+        trace_vfs_pread_err(err);
+        errno = err;
+        return -1;
+    };
+
+    struct iovec vec = {
+        .iov_base = buf,
+        .iov_len = count,
+    };
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    size_t done;
+    err = sys_read(fp, &vec, 1, offset, &done);
+    fdrop(fp);
+    if (has_error(err, done)) {
+        return fail(err);
+    }
+
+    trace_vfs_pread_ret(done);
+    return done;
+}
+
+LFS64(pread);
+
+// read(): implemented as pread() with an offset of -1.
+// NOTE(review): -1 presumably means "use the file's current position" —
+// confirm against sys_read().
+ssize_t read(int fd, void *buf, size_t count)
+{
+ return pread(fd, buf, count, -1);
+}
+
+TRACEPOINT(trace_vfs_pwrite, "%d %p 0x%x 0x%x", int, const void*, size_t,
off_t);
+TRACEPOINT(trace_vfs_pwrite_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pwrite_err, "%d", int);
+
+/*
+ * pwrite(2): write up to count bytes from buf to fd using sys_write() with
+ * the given offset. Returns the number of bytes written, or -1 with errno
+ * set when has_error() classifies the result as a failure.
+ */
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    trace_vfs_pwrite(fd, buf, count, offset);
+
+    auto fail = [](int err) -> ssize_t {
+        trace_vfs_pwrite_err(err);
+        errno = err;
+        return -1;
+    };
+
+    /* struct iovec has no const variant, hence the cast. */
+    struct iovec vec = {
+        .iov_base = (void *)buf,
+        .iov_len = count,
+    };
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    size_t done;
+    err = sys_write(fp, &vec, 1, offset, &done);
+    fdrop(fp);
+    if (has_error(err, done)) {
+        return fail(err);
+    }
+
+    trace_vfs_pwrite_ret(done);
+    return done;
+}
+
+LFS64(pwrite);
+
+// write(): implemented as pwrite() with an offset of -1.
+// NOTE(review): -1 presumably means "use the file's current position" —
+// confirm against sys_write().
+ssize_t write(int fd, const void *buf, size_t count)
+{
+ return pwrite(fd, buf, count, -1);
+}
+
+/*
+ * preadv(2): scatter-read from fd into the iovcnt buffers described by iov
+ * using sys_read() with the given offset. Returns the number of bytes
+ * read, or -1 with errno set on failure.
+ */
+ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        errno = err;
+        return -1;
+    }
+
+    size_t done;
+    err = sys_read(fp, iov, iovcnt, offset, &done);
+    fdrop(fp);
+
+    if (has_error(err, done)) {
+        errno = err;
+        return -1;
+    }
+    return done;
+}
+
+LFS64(preadv);
+
+// readv(): implemented as preadv() with an offset of -1.
+ssize_t readv(int fd, const struct iovec *iov, int iovcnt)
+{
+ return preadv(fd, iov, iovcnt, -1);
+}
+
+TRACEPOINT(trace_vfs_pwritev, "%d %p 0x%x 0x%x", int, const struct iovec*,
int, off_t);
+TRACEPOINT(trace_vfs_pwritev_ret, "0x%x", ssize_t);
+TRACEPOINT(trace_vfs_pwritev_err, "%d", int);
+
+/*
+ * pwritev(2): gather-write the iovcnt buffers described by iov to fd using
+ * sys_write() with the given offset. Returns the number of bytes written,
+ * or -1 with errno set on failure.
+ */
+ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+    auto fail = [](int err) -> ssize_t {
+        trace_vfs_pwritev_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_pwritev(fd, iov, iovcnt, offset);
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    size_t done;
+    err = sys_write(fp, iov, iovcnt, offset, &done);
+    fdrop(fp);
+    if (has_error(err, done)) {
+        return fail(err);
+    }
+
+    trace_vfs_pwritev_ret(done);
+    return done;
+}
+LFS64(pwritev);
+
+// writev(): implemented as pwritev() with an offset of -1.
+ssize_t writev(int fd, const struct iovec *iov, int iovcnt)
+{
+ return pwritev(fd, iov, iovcnt, -1);
+}
+
+TRACEPOINT(trace_vfs_ioctl, "%d 0x%x", int, unsigned long);
+TRACEPOINT(trace_vfs_ioctl_ret, "");
+TRACEPOINT(trace_vfs_ioctl_err, "%d", int);
+
+/*
+ * ioctl(2): forward a device control request to sys_ioctl().
+ *
+ * The glibc ABI declares ioctl() as variadic, so the prototype has to
+ * match; exactly one pointer-sized argument is read from the variable
+ * argument list and passed on. Returns 0 on success, -1 with errno set on
+ * failure.
+ */
+int ioctl(int fd, unsigned long int request, ...)
+{
+    auto fail = [](int err) {
+        trace_vfs_ioctl_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_ioctl(fd, request);
+
+    va_list args;
+    va_start(args, request);
+    void *arg = va_arg(args, void*);
+    va_end(args);
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    err = sys_ioctl(fp, request, arg);
+    fdrop(fp);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_ioctl_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_fsync, "%d", int);
+TRACEPOINT(trace_vfs_fsync_ret, "");
+TRACEPOINT(trace_vfs_fsync_err, "%d", int);
+
+/*
+ * fsync(2): flush the file behind fd through sys_fsync().
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int fsync(int fd)
+{
+    auto fail = [](int err) {
+        trace_vfs_fsync_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_fsync(fd);
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    err = sys_fsync(fp);
+    fdrop(fp);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_fsync_ret();
+    return 0;
+}
+
+// fdatasync(): currently identical to fsync().
+int fdatasync(int fd)
+{
+ // TODO: See if we can do less than fsync().
+ return fsync(fd);
+}
+
+TRACEPOINT(trace_vfs_fstat, "%d %p", int, struct stat*);
+TRACEPOINT(trace_vfs_fstat_ret, "");
+TRACEPOINT(trace_vfs_fstat_err, "%d", int);
+
+/*
+ * Versioned fstat entry point: fill *st for the file behind fd using
+ * sys_fstat(). The ver argument is not examined.
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+extern "C"
+int __fxstat(int ver, int fd, struct stat *st)
+{
+    auto fail = [](int err) {
+        trace_vfs_fstat_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_fstat(fd, st);
+
+    struct file *fp;
+    int err = fget(fd, &fp);
+    if (err) {
+        return fail(err);
+    }
+
+    err = sys_fstat(fp, st);
+    fdrop(fp);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_fstat_ret();
+    return 0;
+}
+
+LFS64(__fxstat);
+
+// fstat(): forwards to __fxstat() with version 1.
+extern "C"
+int fstat(int fd, struct stat *st)
+{
+ return __fxstat(1, fd, st);
+}
+
+LFS64(fstat);
+
+/*
+ * Versioned fstatat entry point.
+ *
+ * Absolute paths and dirfd == AT_FDCWD are handled by stat(); an empty
+ * pathname with AT_EMPTY_PATH is handled by fstat(dirfd). Otherwise the
+ * absolute path is built from the mount path and dentry path of dirfd and
+ * passed to stat() while the directory vnode is locked.
+ * AT_SYMLINK_NOFOLLOW is reported through UNIMPLEMENTED(). The ver
+ * argument is not examined.
+ */
+extern "C"
+int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st,
+               int flags)
+{
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        UNIMPLEMENTED("fstatat() with AT_SYMLINK_NOFOLLOW");
+    }
+
+    const bool plain_stat = (pathname[0] == '/') || (dirfd == AT_FDCWD);
+    if (plain_stat) {
+        return stat(pathname, st);
+    }
+
+    // If AT_EMPTY_PATH and pathname is an empty string, fstatat() operates on
+    // dirfd itself, and in that case it doesn't have to be a directory.
+    if ((flags & AT_EMPTY_PATH) && pathname[0] == '\0') {
+        return fstat(dirfd, st);
+    }
+
+    struct file *dirfp;
+    int rc = fget(dirfd, &dirfp);
+    if (rc) {
+        errno = rc;
+        return -1;
+    }
+
+    struct vnode *dirvp = dirfp->f_dentry->d_vnode;
+    vn_lock(dirvp);
+
+    std::unique_ptr<char []> storage (new char[PATH_MAX]);
+    char *abspath = storage.get();
+
+    /* <mount path><directory path>/<pathname> */
+    strlcpy(abspath, dirfp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(abspath, dirfp->f_dentry->d_path, PATH_MAX);
+    strlcat(abspath, "/", PATH_MAX);
+    strlcat(abspath, pathname, PATH_MAX);
+
+    rc = stat(abspath, st);
+
+    vn_unlock(dirvp);
+    fdrop(dirfp);
+
+    return rc;
+}
+
+LFS64(__fxstatat);
+
+// fstatat(): forwards to __fxstatat() with version 1.
+extern "C"
+int fstatat(int dirfd, const char *path, struct stat *st, int flags)
+{
+ return __fxstatat(1, dirfd, path, st, flags);
+}
+
+LFS64(fstatat);
+
+/*
+ * flock(2) stub: validates its arguments but takes no lock.
+ * Fails with EBADF when fd does not refer to an open file and with EINVAL
+ * for an unknown operation; every recognised operation returns 0.
+ */
+extern "C" int flock(int fd, int operation)
+{
+    if (!fileref_from_fd(fd)) {
+        return libc_error(EBADF);
+    }
+
+    const bool known_op =
+        operation == LOCK_SH || operation == (LOCK_SH | LOCK_NB) ||
+        operation == LOCK_EX || operation == (LOCK_EX | LOCK_NB) ||
+        operation == LOCK_UN;
+
+    if (!known_op) {
+        return libc_error(EINVAL);
+    }
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_readdir, "%d %p", int, dirent*);
+TRACEPOINT(trace_vfs_readdir_ret, "");
+TRACEPOINT(trace_vfs_readdir_err, "%d", int);
+
+// Directory stream state: just the file descriptor of the open directory
+// (used as DIR by the functions below).
+struct __dirstream
+{
+ int fd;
+};
+
+// opendir(): allocate a DIR and open path read-only to back it.
+// Returns the new stream, or nullptr if open() failed (errno is whatever
+// open() set).
+DIR *opendir(const char *path)
+{
+ DIR *dir = new DIR;
+
+ // NOTE(review): a plain new-expression normally throws instead of
+ // returning null, so this branch may be unreachable — confirm.
+ if (!dir)
+ return libc_error_ptr<DIR>(ENOMEM);
+
+ dir->fd = open(path, O_RDONLY);
+ if (dir->fd < 0) {
+ delete dir;
+ return nullptr;
+ }
+ return dir;
+}
+
+/*
+ * fdopendir(3): wrap an already open descriptor in a directory stream.
+ * Returns nullptr if fstat() fails (errno from fstat) or if fd is not a
+ * directory (errno = ENOTDIR).
+ */
+DIR *fdopendir(int fd)
+{
+    struct stat sb;
+
+    if (fstat(fd, &sb) < 0) {
+        return nullptr;
+    }
+    if (!S_ISDIR(sb.st_mode)) {
+        errno = ENOTDIR;
+        return nullptr;
+    }
+
+    DIR *stream = new DIR;
+    stream->fd = fd;
+    return stream;
+}
+
+// dirfd(): return the descriptor stored in the stream, or libc_error(EINVAL)
+// for a null stream.
+int dirfd(DIR *dirp)
+{
+ if (!dirp) {
+ return libc_error(EINVAL);
+ }
+
+ return dirp->fd;
+}
+
+/*
+ * closedir(3): close the descriptor behind the stream and free the stream.
+ *
+ * The stream is always freed. The result of close() is propagated so that
+ * a failure (close() returns -1 and sets errno) is reported to the caller
+ * instead of being silently discarded.
+ */
+int closedir(DIR *dir)
+{
+    int ret = close(dir->fd);
+    delete dir;
+    return ret;
+}
+
+// readdir(): non-reentrant wrapper around readdir_r() using a per-thread
+// static entry buffer. On a readdir_r() error the value of
+// libc_error_ptr<struct dirent>(ret) is returned. Otherwise errno is reset
+// to 0 and the result pointer from readdir_r() is returned, which is
+// nullptr when readdir_r() reported no entry.
+struct dirent *readdir(DIR *dir)
+{
+ static __thread struct dirent entry, *result;
+ int ret;
+
+ ret = readdir_r(dir, &entry, &result);
+ if (ret)
+ return libc_error_ptr<struct dirent>(ret);
+
+ errno = 0;
+ return result;
+}
+
+/*
+ * readdir_r(3): read the next directory entry into *entry.
+ *
+ * On success *result is set to entry and 0 is returned. On any error
+ * *result is set to nullptr; ENOENT from the lower layer is reported as a
+ * return value of 0, every other error code is returned as is.
+ */
+int readdir_r(DIR *dir, struct dirent *entry, struct dirent **result)
+{
+    struct file *fp;
+
+    trace_vfs_readdir(dir->fd, entry);
+
+    int err = fget(dir->fd, &fp);
+    if (!err) {
+        err = sys_readdir(fp, entry);
+        fdrop(fp);
+    }
+
+    if (err) {
+        trace_vfs_readdir_err(err);
+    } else {
+        trace_vfs_readdir_ret();
+    }
+
+    // Our dirent has (like Linux) a d_reclen field, but a constant size.
+    entry->d_reclen = sizeof(*entry);
+
+    *result = err ? nullptr : entry;
+
+    if (err == ENOENT) {
+        return 0;
+    }
+    return err;
+}
+
+// FIXME: in 64bit dirent64 and dirent are identical, so it's safe to alias
+#undef readdir64_r
+extern "C" int readdir64_r(DIR *dir, struct dirent64 *entry,
+ struct dirent64 **result)
+ __attribute__((alias("readdir_r")));
+
+#undef readdir64
+extern "C" struct dirent *readdir64(DIR *dir)
__attribute__((alias("readdir")));
+
+/*
+ * rewinddir(3): reset the directory stream through sys_rewinddir().
+ * Errors (bad descriptor, sys_rewinddir() failure) are silently ignored.
+ */
+void rewinddir(DIR *dirp)
+{
+    struct file *fp;
+
+    // POSIX specifies that what rewinddir() does in the case of error
+    // is undefined, so a failing fget() just ends the call.
+    if (fget(dirp->fd, &fp)) {
+        return;
+    }
+
+    // Again, error code from sys_rewinddir() is ignored.
+    sys_rewinddir(fp);
+    fdrop(fp);
+}
+
+/*
+ * telldir(3): return the current position of the directory stream as
+ * reported by sys_telldir(), or libc_error(<code>) on failure.
+ */
+long telldir(DIR *dirp)
+{
+    struct file *fp;
+    long pos;
+
+    int err = fget(dirp->fd, &fp);
+    if (err) {
+        return libc_error(err);
+    }
+
+    err = sys_telldir(fp, &pos);
+    fdrop(fp);
+
+    if (err) {
+        return libc_error(err);
+    }
+    return pos;
+}
+
+/*
+ * seekdir(3): move the directory stream to loc through sys_seekdir().
+ * Errors are silently ignored.
+ */
+void seekdir(DIR *dirp, long loc)
+{
+    struct file *fp;
+
+    // POSIX specifies seekdir() cannot return errors.
+    if (fget(dirp->fd, &fp)) {
+        return;
+    }
+
+    // Again, error code from sys_seekdir() is ignored.
+    sys_seekdir(fp, loc);
+    fdrop(fp);
+}
+
+TRACEPOINT(trace_vfs_mkdir, "\"%s\" 0%0o", const char*, mode_t);
+TRACEPOINT(trace_vfs_mkdir_ret, "");
+TRACEPOINT(trace_vfs_mkdir_err, "%d", int);
+
+/*
+ * mkdir(2): create directory pathname with the given mode, filtered
+ * through the umask. Returns 0 on success, -1 with errno set on failure.
+ */
+int
+mkdir(const char *pathname, mode_t mode)
+{
+    auto fail = [](int err) {
+        trace_vfs_mkdir_err(err);
+        errno = err;
+        return -1;
+    };
+
+    char path[PATH_MAX];
+
+    mode = apply_umask(mode);
+    trace_vfs_mkdir(pathname, mode);
+
+    int err = task_conv(main_task, pathname, VWRITE, path);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = sys_mkdir(path, mode);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_mkdir_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_rmdir, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_rmdir_ret, "");
+TRACEPOINT(trace_vfs_rmdir_err, "%d", int);
+
+/*
+ * rmdir(2): remove directory pathname.
+ * A null pathname fails with ENOENT. Returns 0 on success, -1 with errno
+ * set on failure.
+ */
+int rmdir(const char *pathname)
+{
+    auto fail = [](int err) {
+        trace_vfs_rmdir_err(err);
+        errno = err;
+        return -1;
+    };
+
+    char path[PATH_MAX];
+
+    trace_vfs_rmdir(pathname);
+
+    if (pathname == nullptr) {
+        return fail(ENOENT);
+    }
+
+    int err = task_conv(main_task, pathname, VWRITE, path);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = sys_rmdir(path);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_rmdir_ret();
+    return 0;
+}
+
+/*
+ * Copy the last component of path (ignoring trailing '/' characters) into
+ * dst and NUL-terminate it. For a path consisting only of slashes, or an
+ * empty path, dst becomes the empty string.
+ *
+ * No bound is applied to the copy: dst must be large enough for the
+ * component plus the terminator.
+ */
+static void
+get_last_component(const char *path, char *dst)
+{
+ int pos = strlen(path) - 1;
+
+ /* Skip trailing slashes. */
+ while (pos >= 0 && path[pos] == '/')
+ pos--;
+
+ int component_end = pos;
+
+ /* Walk back to the slash preceding the component (or the start). */
+ while (pos >= 0 && path[pos] != '/')
+ pos--;
+
+ int component_start = pos + 1;
+
+ int len = component_end - component_start + 1;
+ memcpy(dst, path + component_start, len);
+ dst[len] = 0;
+}
+
+/* True if str is a null pointer or points to an empty string. */
+static bool null_or_empty(const char *str)
+{
+ return str == nullptr || *str == '\0';
+}
+
+TRACEPOINT(trace_vfs_rename, "\"%s\" \"%s\"", const char*, const char*);
+TRACEPOINT(trace_vfs_rename_ret, "");
+TRACEPOINT(trace_vfs_rename_err, "%d", int);
+
+/*
+ * rename(2): rename oldpath to newpath.
+ *
+ * Fails with ENOENT if either path is null or empty, with ENAMETOOLONG if
+ * either path does not fit in PATH_MAX, and with EINVAL if the last
+ * component of either path is "." or "..". Otherwise both paths are
+ * converted with task_conv() and handed to sys_rename().
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+int rename(const char *oldpath, const char *newpath)
+{
+    trace_vfs_rename(oldpath, newpath);
+    struct task *t = main_task;
+    char src[PATH_MAX];
+    char dest[PATH_MAX];
+    int error;
+
+    error = ENOENT;
+    if (null_or_empty(oldpath) || null_or_empty(newpath)) {
+        goto out_errno;
+    }
+
+    /*
+     * get_last_component() copies into src/dest without any bound, so
+     * reject over-long paths before they can overflow the stack buffers.
+     */
+    error = ENAMETOOLONG;
+    if (strlen(oldpath) >= PATH_MAX || strlen(newpath) >= PATH_MAX) {
+        goto out_errno;
+    }
+
+    /* src/dest are used as scratch space for the last components first. */
+    get_last_component(oldpath, src);
+    if (!strcmp(src, ".") || !strcmp(src, "..")) {
+        error = EINVAL;
+        goto out_errno;
+    }
+
+    get_last_component(newpath, dest);
+    if (!strcmp(dest, ".") || !strcmp(dest, "..")) {
+        error = EINVAL;
+        goto out_errno;
+    }
+
+    /* From here on src/dest hold the converted full paths. */
+    if ((error = task_conv(t, oldpath, VREAD, src)) != 0) {
+        goto out_errno;
+    }
+
+    if ((error = task_conv(t, newpath, VWRITE, dest)) != 0) {
+        goto out_errno;
+    }
+
+    error = sys_rename(src, dest);
+    if (error) {
+        goto out_errno;
+    }
+    trace_vfs_rename_ret();
+    return 0;
+ out_errno:
+    trace_vfs_rename_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_chdir, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_chdir_ret, "");
+TRACEPOINT(trace_vfs_chdir_err, "%d", int);
+
+/*
+ * Switch the task's current-directory file to new_cwdfp.
+ *
+ * @t:          task whose working directory changes; nothing is done (and
+ *              0 is returned) when it is null.
+ * @new_cwdfp:  file that becomes t->t_cwdfp on success.
+ * @chdir_func: callback performing the actual change (e.g. updating
+ *              t->t_cwd); its return value is the result of this function.
+ *
+ * The swap happens only if chdir_func() succeeds. On failure t->t_cwdfp is
+ * left untouched and the previous file is not dropped, so a caller that
+ * releases new_cwdfp on error does not leave the task pointing at a
+ * released file.
+ */
+static int replace_cwd(struct task *t, struct file *new_cwdfp,
+                       std::function<int (void)> chdir_func)
+{
+    if (!t) {
+        return 0;
+    }
+
+    /* Do the actual chdir operation here */
+    int error = chdir_func();
+    if (error) {
+        return error;
+    }
+
+    struct file *old = t->t_cwdfp;
+    t->t_cwdfp = new_cwdfp;
+    if (old) {
+        fdrop(old);
+    }
+
+    return 0;
+}
+
+// chdir(): change the task's working directory to pathname.
+// A null pathname fails with ENOENT. The directory is opened with
+// O_DIRECTORY to verify it exists; the resulting file is handed to
+// replace_cwd() together with a callback that copies the converted path
+// into t->t_cwd. Returns 0 on success, -1 with errno set on failure.
+int chdir(const char *pathname)
+{
+ trace_vfs_chdir(pathname);
+ struct task *t = main_task;
+ char path[PATH_MAX];
+ struct file *fp;
+ int error;
+
+ error = ENOENT;
+ if (pathname == nullptr)
+ goto out_errno;
+
+ if ((error = task_conv(t, pathname, VREAD, path)) != 0)
+ goto out_errno;
+
+ /* Check if directory exists */
+ error = sys_open(path, O_DIRECTORY, 0, &fp);
+ if (error) {
+ goto out_errno;
+ }
+
+ /* The callback always returns 0, so the result is not checked. */
+ replace_cwd(t, fp, [&]() { strlcpy(t->t_cwd, path, sizeof(t->t_cwd));
return 0; });
+
+ trace_vfs_chdir_ret();
+ return 0;
+ out_errno:
+ errno = error;
+ trace_vfs_chdir_err(errno);
+ return -1;
+}
+
+TRACEPOINT(trace_vfs_fchdir, "%d", int);
+TRACEPOINT(trace_vfs_fchdir_ret, "");
+TRACEPOINT(trace_vfs_fchdir_err, "%d", int);
+
+// fchdir(): change the task's working directory to the directory behind fd.
+// The file obtained from fget() is passed to replace_cwd() with a callback
+// that runs sys_fchdir(fp, t->t_cwd). If that reports an error the file is
+// dropped again. Returns 0 on success, -1 with errno set on failure.
+int fchdir(int fd)
+{
+ trace_vfs_fchdir(fd);
+ struct task *t = main_task;
+ struct file *fp;
+ int error;
+
+ error = fget(fd, &fp);
+ if (error)
+ goto out_errno;
+
+ error = replace_cwd(t, fp, [&]() { return sys_fchdir(fp, t->t_cwd); });
+ if (error) {
+ /* Give back the reference taken by fget() above. */
+ fdrop(fp);
+ goto out_errno;
+ }
+
+ trace_vfs_fchdir_ret();
+ return 0;
+
+ out_errno:
+ trace_vfs_fchdir_err(error);
+ errno = error;
+ return -1;
+}
+
+TRACEPOINT(trace_vfs_link, "\"%s\" \"%s\"", const char*, const char*);
+TRACEPOINT(trace_vfs_link_ret, "");
+TRACEPOINT(trace_vfs_link_err, "%d", int);
+
+/*
+ * link(2): create newpath as a hard link to oldpath via sys_link().
+ * A null path fails with ENOENT. Returns 0 on success, -1 with errno set
+ * on failure.
+ */
+int link(const char *oldpath, const char *newpath)
+{
+    auto fail = [](int err) {
+        trace_vfs_link_err(err);
+        errno = err;
+        return -1;
+    };
+
+    char from[PATH_MAX];
+    char to[PATH_MAX];
+
+    trace_vfs_link(oldpath, newpath);
+
+    if (oldpath == nullptr || newpath == nullptr) {
+        return fail(ENOENT);
+    }
+
+    int err = task_conv(main_task, oldpath, VWRITE, from);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = task_conv(main_task, newpath, VWRITE, to);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = sys_link(from, to);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_link_ret();
+    return 0;
+}
+
+
+TRACEPOINT(trace_vfs_symlink, "oldpath=%s, newpath=%s", const char*, const
char*);
+TRACEPOINT(trace_vfs_symlink_ret, "");
+TRACEPOINT(trace_vfs_symlink_err, "errno=%d", int);
+
+/*
+ * symlink(2): create newpath as a symbolic link to oldpath via
+ * sys_symlink(). The paths are passed on unconverted. A null path fails
+ * with ENOENT. Returns 0 on success, -1 with errno set on failure.
+ */
+int symlink(const char *oldpath, const char *newpath)
+{
+    auto fail = [](int err) {
+        errno = err;
+        trace_vfs_symlink_err(err);
+        return (-1);
+    };
+
+    trace_vfs_symlink(oldpath, newpath);
+
+    if (oldpath == nullptr || newpath == nullptr) {
+        return fail(ENOENT);
+    }
+
+    const int err = sys_symlink(oldpath, newpath);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_symlink_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_unlink, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_unlink_ret, "");
+TRACEPOINT(trace_vfs_unlink_err, "%d", int);
+
+/*
+ * unlink(2): remove pathname via sys_unlink().
+ * A null pathname fails with ENOENT. Returns 0 on success, -1 with errno
+ * set on failure.
+ */
+int unlink(const char *pathname)
+{
+    auto fail = [](int err) {
+        trace_vfs_unlink_err(err);
+        errno = err;
+        return -1;
+    };
+
+    trace_vfs_unlink(pathname);
+
+    char path[PATH_MAX];
+
+    if (pathname == nullptr) {
+        return fail(ENOENT);
+    }
+
+    int err = task_conv(main_task, pathname, VWRITE, path);
+    if (err != 0) {
+        return fail(err);
+    }
+
+    err = sys_unlink(path);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_unlink_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_stat, "\"%s\" %p", const char*, struct stat*);
+TRACEPOINT(trace_vfs_stat_ret, "");
+TRACEPOINT(trace_vfs_stat_err, "%d", int);
+
+/*
+ * Versioned stat entry point: fill *st for pathname using sys_stat().
+ * The ver argument is not examined. Returns 0 on success, -1 with errno
+ * set on failure.
+ */
+extern "C"
+int __xstat(int ver, const char *pathname, struct stat *st)
+{
+    auto fail = [](int err) {
+        trace_vfs_stat_err(err);
+        errno = err;
+        return -1;
+    };
+
+    char path[PATH_MAX];
+
+    trace_vfs_stat(pathname, st);
+
+    int err = task_conv(main_task, pathname, 0, path);
+    if (err) {
+        return fail(err);
+    }
+
+    err = sys_stat(path, st);
+    if (err) {
+        return fail(err);
+    }
+
+    trace_vfs_stat_ret();
+    return 0;
+}
+
+LFS64(__xstat);
+
+/* stat() forwards to __xstat() with version number 1. */
+int stat(const char *pathname, struct stat *st)
+{
+    const int xstat_version = 1;
+
+    return __xstat(xstat_version, pathname, st);
+}
+
+LFS64(stat);
+
+TRACEPOINT(trace_vfs_lstat, "pathname=%s, stat=%p", const char*, struct stat*);
+TRACEPOINT(trace_vfs_lstat_ret, "");
+TRACEPOINT(trace_vfs_lstat_err, "errno=%d", int);
+/*
+ * Versioned lstat() entry point; the ver argument is accepted but not used.
+ *
+ * Returns 0 on success, or -1 with errno set to the code reported by
+ * task_conv() or sys_lstat().
+ */
+extern "C"
+int __lxstat(int ver, const char *pathname, struct stat *st)
+{
+    char abs_path[PATH_MAX];
+    int rc;
+
+    trace_vfs_lstat(pathname, st);
+
+    rc = task_conv(main_task, pathname, 0, abs_path);
+    if (!rc)
+        rc = sys_lstat(abs_path, st);
+
+    if (rc) {
+        errno = rc;
+        trace_vfs_lstat_err(rc);
+        return (-1);
+    }
+
+    trace_vfs_lstat_ret();
+    return 0;
+}
+
+LFS64(__lxstat);
+
+/* lstat() forwards to __lxstat() with version number 1. */
+int lstat(const char *pathname, struct stat *st)
+{
+    const int lxstat_version = 1;
+
+    return __lxstat(lxstat_version, pathname, st);
+}
+
+LFS64(lstat);
+
+TRACEPOINT(trace_vfs_statfs, "\"%s\" %p", const char*, struct statfs*);
+TRACEPOINT(trace_vfs_statfs_ret, "");
+TRACEPOINT(trace_vfs_statfs_err, "%d", int);
+
+/*
+ * Fill *buf with file system statistics for the file system containing
+ * pathname.
+ *
+ * Returns 0 on success, or -1 with errno set to the code reported by
+ * task_conv() or sys_statfs().
+ */
+extern "C"
+int __statfs(const char *pathname, struct statfs *buf)
+{
+    char abs_path[PATH_MAX];
+    int rc;
+
+    trace_vfs_statfs(pathname, buf);
+
+    rc = task_conv(main_task, pathname, 0, abs_path);
+    if (rc == 0)
+        rc = sys_statfs(abs_path, buf);
+
+    if (rc != 0) {
+        trace_vfs_statfs_err(rc);
+        errno = rc;
+        return -1;
+    }
+
+    trace_vfs_statfs_ret();
+    return 0;
+}
+weak_alias(__statfs, statfs);
+
+LFS64(statfs);
+
+/* The first tracepoint argument is the integer descriptor, so format it as %d. */
+TRACEPOINT(trace_vfs_fstatfs, "%d %p", int, struct statfs*);
+TRACEPOINT(trace_vfs_fstatfs_ret, "");
+TRACEPOINT(trace_vfs_fstatfs_err, "%d", int);
+
+/*
+ * Fill *buf with file system statistics for the file system that holds
+ * the open descriptor fd.
+ *
+ * Returns 0 on success, or -1 with errno set to the code reported by
+ * fget() or sys_fstatfs().
+ */
+extern "C"
+int __fstatfs(int fd, struct statfs *buf)
+{
+    struct file *fp;
+    int error;
+
+    trace_vfs_fstatfs(fd, buf);
+    error = fget(fd, &fp);
+    if (error)
+        goto out_errno;
+
+    error = sys_fstatfs(fp, buf);
+    /* The reference taken by fget() is released whether or not the call worked. */
+    fdrop(fp);
+
+    if (error)
+        goto out_errno;
+    trace_vfs_fstatfs_ret();
+    return 0;
+
+ out_errno:
+    trace_vfs_fstatfs_err(error);
+    errno = error;
+    return -1;
+}
+weak_alias(__fstatfs, fstatfs);
+
+LFS64(fstatfs);
+
+/*
+ * Translate a struct statfs into a struct statvfs.
+ *
+ * f_frsize is filled from the source block size, f_favail is always
+ * reported as 0, and only the first word of the source f_fsid is kept.
+ * Always returns 0.
+ */
+static int
+statfs_to_statvfs(struct statvfs *dst, struct statfs *src)
+{
+    /* Block size and block counts. */
+    dst->f_bsize = src->f_bsize;
+    dst->f_frsize = src->f_bsize;
+    dst->f_blocks = src->f_blocks;
+    dst->f_bfree = src->f_bfree;
+    dst->f_bavail = src->f_bavail;
+
+    /* File (inode) counts. */
+    dst->f_files = src->f_files;
+    dst->f_ffree = src->f_ffree;
+    dst->f_favail = 0;
+
+    /* Identification, flags and limits. */
+    dst->f_fsid = src->f_fsid.__val[0];
+    dst->f_flag = src->f_flags;
+    dst->f_namemax = src->f_namelen;
+
+    return 0;
+}
+
+/*
+ * statvfs() implemented on top of __statfs(): query by path, then convert
+ * the result. Returns -1 if __statfs() fails.
+ */
+int
+statvfs(const char *pathname, struct statvfs *buf)
+{
+    struct statfs fs_stats;
+    int rc = __statfs(pathname, &fs_stats);
+
+    if (rc < 0)
+        return -1;
+
+    return statfs_to_statvfs(buf, &fs_stats);
+}
+
+LFS64(statvfs);
+
+/*
+ * fstatvfs() implemented on top of __fstatfs(): query by descriptor, then
+ * convert the result. Returns -1 if __fstatfs() fails.
+ */
+int
+fstatvfs(int fd, struct statvfs *buf)
+{
+    struct statfs fs_stats;
+    int rc = __fstatfs(fd, &fs_stats);
+
+    if (rc < 0)
+        return -1;
+
+    return statfs_to_statvfs(buf, &fs_stats);
+}
+
+LFS64(fstatvfs);
+
+
+TRACEPOINT(trace_vfs_getcwd, "%p %d", char*, size_t);
+TRACEPOINT(trace_vfs_getcwd_ret, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_getcwd_err, "%d", int);
+
+/*
+ * Copy the current working directory of main_task into path.
+ *
+ * path: destination buffer, or null to have one allocated with malloc()
+ *       (the caller then owns it and must free() it).
+ * size: capacity of path in bytes. With a null path, 0 means "allocate
+ *       exactly as much as needed".
+ *
+ * Returns the buffer on success. On failure returns null and sets errno:
+ * ENOMEM if allocation failed, EINVAL if a caller buffer has size 0,
+ * ERANGE if size is too small for the path plus its terminating NUL.
+ */
+char *getcwd(char *path, size_t size)
+{
+    trace_vfs_getcwd(path, size);
+    struct task *t = main_task;
+    /* Bytes required, including the terminating NUL. */
+    size_t len = strlen(t->t_cwd) + 1;
+    /* Non-null only when this function allocated the buffer itself. */
+    char *allocated = nullptr;
+    int error;
+
+    if (!path) {
+        if (!size)
+            size = len;
+        allocated = (char*)malloc(size);
+        if (!allocated) {
+            error = ENOMEM;
+            goto out_errno;
+        }
+        path = allocated;
+    } else {
+        if (!size) {
+            error = EINVAL;
+            goto out_errno;
+        }
+    }
+
+    if (size < len) {
+        error = ERANGE;
+        goto out_errno;
+    }
+
+    memcpy(path, t->t_cwd, len);
+    trace_vfs_getcwd_ret(path);
+    return path;
+
+ out_errno:
+    /* Do not leak a buffer we allocated; free(nullptr) is a no-op. */
+    free(allocated);
+    trace_vfs_getcwd_err(error);
+    errno = error;
+    return nullptr;
+}
+
+TRACEPOINT(trace_vfs_dup, "%d", int);
+/* The traced return value is the new descriptor, an int, so use %d. */
+TRACEPOINT(trace_vfs_dup_ret, "%d", int);
+TRACEPOINT(trace_vfs_dup_err, "%d", int);
+/*
+ * Duplicate a file descriptor.
+ *
+ * Returns the descriptor chosen by fdalloc() on success, or -1 with errno
+ * set to the code reported by fget() or fdalloc().
+ */
+int dup(int oldfd)
+{
+    struct file *fp;
+    int newfd;
+    int error;
+
+    trace_vfs_dup(oldfd);
+    error = fget(oldfd, &fp);
+    if (error)
+        goto out_errno;
+
+    error = fdalloc(fp, &newfd);
+    /* Drop the reference taken by fget() on both the success and failure path. */
+    fdrop(fp);
+    if (error)
+        goto out_errno;
+
+    trace_vfs_dup_ret(newfd);
+    return newfd;
+
+ out_errno:
+    trace_vfs_dup_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_dup3, "%d %d 0x%x", int, int, int);
+TRACEPOINT(trace_vfs_dup3_ret, "%d", int);
+TRACEPOINT(trace_vfs_dup3_err, "%d", int);
+/*
+ * Duplicate a file descriptor to a particular value.
+ *
+ * Only O_CLOEXEC is accepted in flags, and it is ignored. Identical
+ * descriptors are rejected with EINVAL. Returns newfd on success, or -1
+ * with errno set.
+ */
+int dup3(int oldfd, int newfd, int flags)
+{
+    struct file *fp;
+    int rc;
+
+    trace_vfs_dup3(oldfd, newfd, flags);
+
+    /*
+     * Don't allow any argument but O_CLOEXEC.  But we even ignore
+     * that as we don't support exec() and thus don't care.
+     */
+    if ((flags & ~O_CLOEXEC) != 0 || oldfd == newfd) {
+        rc = EINVAL;
+    } else {
+        rc = fget(oldfd, &fp);
+        if (rc == 0) {
+            rc = fdset(newfd, fp);
+            fdrop(fp);
+        }
+    }
+
+    if (rc != 0) {
+        trace_vfs_dup3_err(rc);
+        errno = rc;
+        return -1;
+    }
+
+    trace_vfs_dup3_ret(newfd);
+    return newfd;
+}
+
+/*
+ * Duplicate oldfd onto newfd.
+ *
+ * When both descriptors are equal nothing is duplicated, but oldfd must
+ * still refer to an open file: otherwise the call fails with the error
+ * reported by fget() instead of pretending to succeed. All other cases
+ * are delegated to dup3() with no flags.
+ */
+int dup2(int oldfd, int newfd)
+{
+    if (oldfd == newfd) {
+        struct file *fp;
+        int error = fget(oldfd, &fp);
+
+        if (error) {
+            errno = error;
+            return -1;
+        }
+        /* Only validity was being checked; release the reference again. */
+        fdrop(fp);
+        return newfd;
+    }
+
+    return dup3(oldfd, newfd, 0);
+}
+
+/*
+ * The file control system call.
+ */
+#define SETFL (O_APPEND | O_ASYNC | O_DIRECT | O_NOATIME | O_NONBLOCK)
+
+TRACEPOINT(trace_vfs_fcntl, "%d %d 0x%x", int, int, int);
+/* The traced return value is an int, so format it with %d. */
+TRACEPOINT(trace_vfs_fcntl_ret, "%d", int);
+TRACEPOINT(trace_vfs_fcntl_err, "%d", int);
+
+/*
+ * File control on an open descriptor.
+ *
+ * fd:  descriptor to operate on.
+ * cmd: F_DUPFD, F_GETFD, F_SETFD, F_GETFL or F_SETFL. F_SETLK, F_GETLK,
+ *      F_SETLKW and F_SETOWN are stubs that warn once and report success.
+ * arg: command-specific integer argument.
+ *
+ * Returns the command-specific result (0 where none is defined), or -1
+ * with errno set; unknown commands fail with EINVAL.
+ */
+extern "C"
+int fcntl(int fd, int cmd, int arg)
+{
+    struct file *fp;
+    int ret = 0, error;
+    int tmp;
+
+    trace_vfs_fcntl(fd, cmd, arg);
+    error = fget(fd, &fp);
+    if (error)
+        goto out_errno;
+
+    // An important note about our handling of FD_CLOEXEC / O_CLOEXEC:
+    // close-on-exec shouldn't have been a file flag (fp->f_flags) - it is a
+    // file descriptor flag, meaning that two dup()ed file descriptors
+    // could have different values for FD_CLOEXEC. Our current implementation
+    // *wrongly* makes close-on-exec an f_flag (using the bit O_CLOEXEC).
+    // There is little practical difference, though, because this flag is
+    // ignored in OSv anyway, as it doesn't support exec().
+    switch (cmd) {
+    case F_DUPFD:
+        // On failure, fall through to the common fdrop() below instead of
+        // jumping straight to out_errno, which would leak the reference
+        // taken by fget().
+        error = _fdalloc(fp, &ret, arg);
+        break;
+    case F_GETFD:
+        ret = (fp->f_flags & O_CLOEXEC) ? FD_CLOEXEC : 0;
+        break;
+    case F_SETFD:
+        FD_LOCK(fp);
+        fp->f_flags = (fp->f_flags & ~O_CLOEXEC) |
+                ((arg & FD_CLOEXEC) ? O_CLOEXEC : 0);
+        FD_UNLOCK(fp);
+        break;
+    case F_GETFL:
+        // As explained above, O_CLOEXEC should not have been in f_flags,
+        // and shouldn't be returned. Linux always returns 0100000 ("the
+        // flag formerly known as O_LARGEFILE") so let's do it too.
+        ret = (oflags(fp->f_flags) & ~O_CLOEXEC) | 0100000;
+        break;
+    case F_SETFL:
+        FD_LOCK(fp);
+        fp->f_flags = fflags((oflags(fp->f_flags) & ~SETFL) |
+                (arg & SETFL));
+        FD_UNLOCK(fp);
+
+        /* Sync nonblocking/async state with file flags */
+        tmp = fp->f_flags & FNONBLOCK;
+        fp->ioctl(FIONBIO, &tmp);
+        tmp = fp->f_flags & FASYNC;
+        fp->ioctl(FIOASYNC, &tmp);
+
+        break;
+    case F_SETLK:
+        WARN_ONCE("fcntl(F_SETLK) stubbed\n");
+        break;
+    case F_GETLK:
+        WARN_ONCE("fcntl(F_GETLK) stubbed\n");
+        break;
+    case F_SETLKW:
+        WARN_ONCE("fcntl(F_SETLKW) stubbed\n");
+        break;
+    case F_SETOWN:
+        WARN_ONCE("fcntl(F_SETOWN) stubbed\n");
+        break;
+    default:
+        kprintf("unsupported fcntl cmd 0x%x\n", cmd);
+        error = EINVAL;
+    }
+
+    fdrop(fp);
+    if (error)
+        goto out_errno;
+    trace_vfs_fcntl_ret(ret);
+    return ret;
+
+ out_errno:
+    trace_vfs_fcntl_err(error);
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_access, "\"%s\" 0%0o", const char*, int);
+TRACEPOINT(trace_vfs_access_ret, "");
+TRACEPOINT(trace_vfs_access_err, "%d", int);
+
+/*
+ * Check permission for file access.
+ *
+ * R_OK and W_OK in mode are translated to VREAD and VWRITE for
+ * task_conv(); the unmodified mode is then passed to sys_access().
+ * Returns 0 on success, or -1 with errno set.
+ */
+int access(const char *pathname, int mode)
+{
+    trace_vfs_access(pathname, mode);
+
+    char abs_path[PATH_MAX];
+    int vacc = 0;
+
+    if (mode & R_OK)
+        vacc |= VREAD;
+    if (mode & W_OK)
+        vacc |= VWRITE;
+
+    int rc = task_conv(main_task, pathname, vacc, abs_path);
+    if (rc == 0)
+        rc = sys_access(abs_path, mode);
+
+    if (rc != 0) {
+        errno = rc;
+        trace_vfs_access_err(rc);
+        return -1;
+    }
+
+    trace_vfs_access_ret();
+    return 0;
+}
+
+/*
+ * access() relative to a directory descriptor.
+ *
+ * Absolute paths and dirfd == AT_FDCWD are handled by access() directly.
+ * Otherwise an absolute path is built from the mount path and dentry path
+ * of dirfd plus pathname, and checked with access() while the directory's
+ * vnode is locked. AT_SYMLINK_NOFOLLOW is reported through UNIMPLEMENTED().
+ * Returns the result of access(), or -1 with errno set if dirfd cannot be
+ * resolved.
+ */
+int faccessat(int dirfd, const char *pathname, int mode, int flags)
+{
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        UNIMPLEMENTED("faccessat() with AT_SYMLINK_NOFOLLOW");
+    }
+
+    const bool plain_access = (pathname[0] == '/') || (dirfd == AT_FDCWD);
+    if (plain_access) {
+        return access(pathname, mode);
+    }
+
+    struct file *dirfp;
+    int rc = fget(dirfd, &dirfp);
+    if (rc) {
+        errno = rc;
+        return -1;
+    }
+
+    struct vnode *vp = dirfp->f_dentry->d_vnode;
+    vn_lock(vp);
+
+    std::unique_ptr<char []> storage (new char[PATH_MAX]);
+    char *full_path = storage.get();
+
+    /* build absolute path: <mount path><dentry path>/<pathname> */
+    strlcpy(full_path, dirfp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(full_path, dirfp->f_dentry->d_path, PATH_MAX);
+    strlcat(full_path, "/", PATH_MAX);
+    strlcat(full_path, pathname, PATH_MAX);
+
+    rc = access(full_path, mode);
+
+    vn_unlock(vp);
+    fdrop(dirfp);
+
+    return rc;
+}
+
+/* euidaccess() performs exactly the same check as access(). */
+extern "C"
+int euidaccess(const char *pathname, int mode)
+{
+    int rc = access(pathname, mode);
+
+    return rc;
+}
+
+weak_alias(euidaccess,eaccess);
+
+/*
+ * Disabled code: everything up to the matching #endif is compiled out by
+ * "#if 0". It sketches pipe creation through a FIFO node created with
+ * sys_mknod() under /mnt/fifo and opened once for reading and once for
+ * writing; without CONFIG_FIFOFS it would simply return ENOSYS.
+ */
+#if 0
+static int
+fs_pipe(struct task *t, struct msg *msg)
+{
+#ifdef CONFIG_FIFOFS
+ char path[PATH_MAX];
+ file_t rfp, wfp;
+ int error, rfd, wfd;
+
+ DPRINTF(VFSDB_CORE, ("fs_pipe\n"));
+
+ if ((rfd = task_newfd(t)) == -1)
+ return EMFILE;
+ t->t_ofile[rfd] = (file_t)1; /* temp */
+
+ if ((wfd = task_newfd(t)) == -1) {
+ t->t_ofile[rfd] = nullptr;
+ return EMFILE;
+ }
+ sprintf(path, "/mnt/fifo/pipe-%x-%d", (u_int)t->t_taskid, rfd);
+
+ if ((error = sys_mknod(path, S_IFIFO)) != 0)
+ goto out;
+ if ((error = sys_open(path, O_RDONLY | O_NONBLOCK, 0, &rfp)) != 0) {
+ goto out;
+ }
+ if ((error = sys_open(path, O_WRONLY | O_NONBLOCK, 0, &wfp)) != 0) {
+ goto out;
+ }
+ t->t_ofile[rfd] = rfp;
+ t->t_ofile[wfd] = wfp;
+ t->t_nopens += 2;
+ msg->data[0] = rfd;
+ msg->data[1] = wfd;
+ return 0;
+ out:
+ t->t_ofile[rfd] = nullptr;
+ t->t_ofile[wfd] = nullptr;
+ return error;
+#else
+ return ENOSYS;
+#endif
+}
+#endif
+
+TRACEPOINT(trace_vfs_isatty, "%d", int);
+TRACEPOINT(trace_vfs_isatty_ret, "%d", int);
+TRACEPOINT(trace_vfs_isatty_err, "%d", int);
+
+/*
+ * Return whether the specified descriptor refers to a tty.
+ *
+ * Returns 1 if the file is a tty_file or its vnode carries VISTTY, and 0
+ * otherwise. An invalid descriptor also yields 0, with errno set to EBADF:
+ * callers use the result as a boolean, so a negative value would be
+ * mistaken for "is a tty".
+ */
+int isatty(int fd)
+{
+    struct file *fp;
+    int istty = 0;
+
+    trace_vfs_isatty(fd);
+    fileref f(fileref_from_fd(fd));
+    if (!f) {
+        errno = EBADF;
+        trace_vfs_isatty_err(errno);
+        return 0;
+    }
+
+    fp = f.get();
+    if (dynamic_cast<tty_file*>(fp) ||
+        (fp->f_dentry && (fp->f_dentry->d_vnode->v_flags & VISTTY))) {
+        istty = 1;
+    }
+
+    trace_vfs_isatty_ret(istty);
+    return istty;
+}
+
+TRACEPOINT(trace_vfs_truncate, "\"%s\" 0x%x", const char*, off_t);
+TRACEPOINT(trace_vfs_truncate_ret, "");
+TRACEPOINT(trace_vfs_truncate_err, "%d", int);
+
+/*
+ * Set the size of the file named by pathname to length.
+ *
+ * Returns 0 on success, or -1 with errno set (ENOENT when pathname is
+ * null, otherwise the code from task_conv() or sys_truncate()).
+ */
+int truncate(const char *pathname, off_t length)
+{
+    trace_vfs_truncate(pathname, length);
+
+    char abs_path[PATH_MAX];
+    int rc = ENOENT;
+
+    if (pathname != nullptr)
+        rc = task_conv(main_task, pathname, VWRITE, abs_path);
+    if (rc == 0)
+        rc = sys_truncate(abs_path, length);
+
+    if (rc != 0) {
+        errno = rc;
+        trace_vfs_truncate_err(rc);
+        return -1;
+    }
+
+    trace_vfs_truncate_ret();
+    return 0;
+}
+
+LFS64(truncate);
+
+TRACEPOINT(trace_vfs_ftruncate, "%d 0x%x", int, off_t);
+TRACEPOINT(trace_vfs_ftruncate_ret, "");
+TRACEPOINT(trace_vfs_ftruncate_err, "%d", int);
+
+/*
+ * Set the size of the file open on fd to length.
+ *
+ * Returns 0 on success, or -1 with errno set to the code reported by
+ * fget() or sys_ftruncate().
+ */
+int ftruncate(int fd, off_t length)
+{
+    trace_vfs_ftruncate(fd, length);
+
+    struct file *fp;
+    int rc = fget(fd, &fp);
+
+    if (rc == 0) {
+        rc = sys_ftruncate(fp, length);
+        fdrop(fp);
+    }
+
+    if (rc != 0) {
+        errno = rc;
+        trace_vfs_ftruncate_err(rc);
+        return -1;
+    }
+
+    trace_vfs_ftruncate_ret();
+    return 0;
+}
+
+LFS64(ftruncate);
+
+/*
+ * Read the target of the symbolic link pathname into buf.
+ *
+ * buf/bufsize: destination buffer and its capacity in bytes.
+ *
+ * Returns the size reported by sys_readlink() on success. On failure
+ * returns -1 with errno set to a positive error code: EINVAL for a
+ * zero-sized buffer, ENOENT for a null pathname, otherwise the code from
+ * task_conv() or sys_readlink().
+ */
+ssize_t readlink(const char *pathname, char *buf, size_t bufsize)
+{
+    struct task *t = main_task;
+    char path[PATH_MAX];
+    int error;
+    ssize_t size;
+
+    /* errno values are positive; bufsize is unsigned, so only 0 is invalid. */
+    error = EINVAL;
+    if (bufsize == 0)
+        goto out_errno;
+
+    error = ENOENT;
+    if (pathname == nullptr)
+        goto out_errno;
+    /* NOTE(review): VWRITE is requested for a read-only operation - confirm. */
+    error = task_conv(t, pathname, VWRITE, path);
+    if (error)
+        goto out_errno;
+
+    size  = 0;
+    error = sys_readlink(path, buf, bufsize, &size);
+
+    if (error != 0)
+        goto out_errno;
+
+    return size;
+ out_errno:
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fallocate, "%d %d 0x%x 0x%x", int, int, loff_t, loff_t);
+TRACEPOINT(trace_vfs_fallocate_ret, "");
+TRACEPOINT(trace_vfs_fallocate_err, "%d", int);
+
+/*
+ * Forward a space manipulation request for the file open on fd to
+ * sys_fallocate().
+ *
+ * Returns 0 on success, or -1 with errno set to the code reported by
+ * fget() or sys_fallocate().
+ */
+int fallocate(int fd, int mode, loff_t offset, loff_t len)
+{
+    struct file *fp;
+    int rc;
+
+    trace_vfs_fallocate(fd, mode, offset, len);
+
+    rc = fget(fd, &fp);
+    if (rc == 0) {
+        rc = sys_fallocate(fp, mode, offset, len);
+        fdrop(fp);
+    }
+
+    if (rc != 0) {
+        trace_vfs_fallocate_err(rc);
+        errno = rc;
+        return -1;
+    }
+
+    trace_vfs_fallocate_ret();
+    return 0;
+}
+
+LFS64(fallocate);
+
+TRACEPOINT(trace_vfs_utimes, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_utimes_ret, "");
+TRACEPOINT(trace_vfs_utimes_err, "%d", int);
+
+/* Change the timestamps of the file open on fd: futimesat() with no path. */
+int futimes(int fd, const struct timeval times[2])
+{
+    const char *no_path = nullptr;
+
+    return futimesat(fd, no_path, times);
+}
+
+/*
+ * Change file timestamps relative to a directory descriptor.
+ *
+ * dirfd:    directory descriptor, or AT_FDCWD.
+ * pathname: path relative to dirfd; if null the call operates on dirfd
+ *           itself. Absolute paths go straight to utimes().
+ * times:    passed through to utimes().
+ *
+ * Returns 0 on success, or -1 with errno set (ENOTDIR when pathname is
+ * given but dirfd is not a directory, ENOMEM when the path buffer cannot
+ * be allocated, otherwise the error of the failing call).
+ */
+int futimesat(int dirfd, const char *pathname, const struct timeval times[2])
+{
+    struct stat st;
+    struct file *fp;
+    int error;
+    char *absolute_path;
+
+    if ((pathname && pathname[0] == '/') || dirfd == AT_FDCWD)
+        return utimes(pathname, times);
+
+    // Note: if pathname == nullptr, futimesat operates on dirfd itself, and in
+    // that case it doesn't have to be a directory.
+    if (pathname) {
+        error = fstat(dirfd, &st);
+        if (error) {
+            error = errno;
+            goto out_errno;
+        }
+
+        if (!S_ISDIR(st.st_mode)){
+            error = ENOTDIR;
+            goto out_errno;
+        }
+    }
+
+    error = fget(dirfd, &fp);
+    if (error)
+        goto out_errno;
+
+    /* build absolute path */
+    absolute_path = (char*)malloc(PATH_MAX);
+    if (!absolute_path) {
+        fdrop(fp);
+        error = ENOMEM;
+        goto out_errno;
+    }
+    strlcpy(absolute_path, fp->f_dentry->d_mount->m_path, PATH_MAX);
+    strlcat(absolute_path, fp->f_dentry->d_path, PATH_MAX);
+
+    if (pathname) {
+        strlcat(absolute_path, "/", PATH_MAX);
+        strlcat(absolute_path, pathname, PATH_MAX);
+    }
+
+    // utimes() signals failure through its return value and errno (same
+    // convention as the fstat() call above), so capture errno right away,
+    // before free()/fdrop() can disturb it, rather than storing the raw
+    // return value into errno.
+    error = (utimes(absolute_path, times) != 0) ? errno : 0;
+    free(absolute_path);
+
+    fdrop(fp);
+
+    if (error)
+        goto out_errno;
+    return 0;
+
+ out_errno:
+    errno = error;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_utimensat, "\"%s\"", const char*);
+TRACEPOINT(trace_vfs_utimensat_ret, "");
+TRACEPOINT(trace_vfs_utimensat_err, "%d", int);
+
+/*
+ * Thin wrapper around sys_utimensat(): returns 0 on success, or -1 with
+ * errno set to the code it reports.
+ */
+extern "C"
+int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags)
+{
+    trace_vfs_utimensat(pathname);
+
+    auto rc = sys_utimensat(dirfd, pathname, times, flags);
+    if (!rc) {
+        trace_vfs_utimensat_ret();
+        return 0;
+    }
+
+    trace_vfs_utimensat_err(rc);
+    errno = rc;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_futimens, "%d", int);
+TRACEPOINT(trace_vfs_futimens_ret, "");
+TRACEPOINT(trace_vfs_futimens_err, "%d", int);
+
+/*
+ * Thin wrapper around sys_futimens(): returns 0 on success, or -1 with
+ * errno set to the code it reports.
+ */
+extern "C"
+int futimens(int fd, const struct timespec times[2])
+{
+    trace_vfs_futimens(fd);
+
+    auto rc = sys_futimens(fd, times);
+    if (!rc) {
+        trace_vfs_futimens_ret();
+        return 0;
+    }
+
+    trace_vfs_futimens_err(rc);
+    errno = rc;
+    return -1;
+}
+
+/*
+ * Shared implementation of utimes() and lutimes().
+ *
+ * Converts pathname with task_conv() and passes the result, times and
+ * flags to sys_utimes(). Returns 0 on success; on failure returns whatever
+ * libc_error() yields for the error code.
+ */
+static int do_utimes(const char *pathname, const struct timeval times[2], int flags)
+{
+    char abs_path[PATH_MAX];
+
+    trace_vfs_utimes(pathname);
+
+    int rc = task_conv(main_task, pathname, 0, abs_path);
+    if (!rc)
+        rc = sys_utimes(abs_path, times, flags);
+
+    if (rc) {
+        trace_vfs_utimes_err(rc);
+        return libc_error(rc);
+    }
+
+    trace_vfs_utimes_ret();
+    return 0;
+}
+
+/* Change the timestamps of pathname: do_utimes() with no flags. */
+extern "C"
+int utimes(const char *pathname, const struct timeval times[2])
+{
+ return do_utimes(pathname, times, 0);
+}
+
+/* Like utimes(), but passes AT_SYMLINK_NOFOLLOW to do_utimes(). */
+extern "C"
+int lutimes(const char *pathname, const struct timeval times[2])
+{
+ return do_utimes(pathname, times, AT_SYMLINK_NOFOLLOW);
+}
+
+/*
+ * Set access and modification times of pathname with one-second
+ * resolution.
+ *
+ * If t is null both timestamps are set to the current wall-clock time;
+ * otherwise t->actime and t->modtime are used. The microsecond parts are
+ * always 0. Returns the result of utimes().
+ */
+extern "C"
+int utime(const char *pathname, const struct utimbuf *t)
+{
+    using namespace std::chrono;
+
+    long int access_sec;
+    long int modify_sec;
+
+    if (t) {
+        access_sec = t->actime;
+        modify_sec = t->modtime;
+    } else {
+        auto since_epoch = osv::clock::wall::now().time_since_epoch();
+        long int now_sec = duration_cast<seconds>(since_epoch).count();
+        access_sec = now_sec;
+        modify_sec = now_sec;
+    }
+
+    struct timeval times[2];
+    times[0].tv_sec = access_sec;
+    times[0].tv_usec = 0;
+    times[1].tv_sec = modify_sec;
+    times[1].tv_usec = 0;
+
+    return utimes(pathname, times);
+}
+
+TRACEPOINT(trace_vfs_chmod, "\"%s\" 0%0o", const char*, mode_t);
+TRACEPOINT(trace_vfs_chmod_ret, "");
+TRACEPOINT(trace_vfs_chmod_err, "%d", int);
+
+/*
+ * Change the permission bits of pathname; only the ALLPERMS bits of mode
+ * are passed on to sys_chmod().
+ *
+ * Returns 0 on success, or -1 with errno set (ENOENT when pathname is
+ * null).
+ */
+int chmod(const char *pathname, mode_t mode)
+{
+    trace_vfs_chmod(pathname, mode);
+
+    char abs_path[PATH_MAX];
+    int rc = ENOENT;
+
+    if (pathname != nullptr)
+        rc = task_conv(main_task, pathname, VWRITE, abs_path);
+    if (rc == 0)
+        rc = sys_chmod(abs_path, mode & ALLPERMS);
+
+    if (rc != 0) {
+        trace_vfs_chmod_err(rc);
+        errno = rc;
+        return -1;
+    }
+
+    trace_vfs_chmod_ret();
+    return 0;
+}
+
+TRACEPOINT(trace_vfs_fchmod, "\"%d\" 0%0o", int, mode_t);
+TRACEPOINT(trace_vfs_fchmod_ret, "");
+
+/*
+ * Change the permission bits of the file open on fd; only the ALLPERMS
+ * bits of mode are passed on to sys_fchmod().
+ *
+ * The return tracepoint fires whether or not the call succeeded. Returns
+ * 0 on success, or -1 with errno set.
+ */
+int fchmod(int fd, mode_t mode)
+{
+    trace_vfs_fchmod(fd, mode);
+    auto rc = sys_fchmod(fd, mode & ALLPERMS);
+    trace_vfs_fchmod_ret();
+
+    if (!rc)
+        return 0;
+
+    errno = rc;
+    return -1;
+}
+
+TRACEPOINT(trace_vfs_fchown, "\"%d\" %d %d", int, uid_t, gid_t);
+TRACEPOINT(trace_vfs_fchown_ret, "");
+
+/*
+ * Stub: ownership is not changed. The call is traced, WARN_STUBBED() is
+ * invoked, and success (0) is reported unconditionally.
+ */
+int fchown(int fd, uid_t owner, gid_t group)
+{
+ trace_vfs_fchown(fd, owner, group);
+ WARN_STUBBED();
+ trace_vfs_fchown_ret();
+ return 0;
+}
+
+/* Stub: invokes WARN_STUBBED() and reports success without changing ownership. */
+int chown(const char *path, uid_t owner, gid_t group)
+{
+ WARN_STUBBED();
+ return 0;
+}
+
+/* Stub: invokes WARN_STUBBED() and reports success without changing ownership. */
+int lchown(const char *path, uid_t owner, gid_t group)
+{
+ WARN_STUBBED();
+ return 0;
+}
+
+
+/*
+ * Copy up to count bytes from in_fd to out_fd.
+ *
+ * The source range is mapped read-only with mmap() and written to out_fd
+ * with a single write().
+ *
+ * out_fd:  destination descriptor; vnode-backed files must be writable.
+ * in_fd:   source descriptor; must have a dentry and be open for reading.
+ * _offset: if non-null, the read position is taken from *_offset and
+ *          advanced by the bytes written. If null, the current file
+ *          position of in_fd is used and advanced instead.
+ * count:   maximum number of bytes to copy; clamped to the end of file.
+ *
+ * Returns the number of bytes written (0 at or past end of file or when
+ * count is 0), or -1 with errno set.
+ */
+ssize_t sendfile(int out_fd, int in_fd, off_t *_offset, size_t count)
+{
+    struct file *in_fp;
+    struct file *out_fp;
+    fileref in_f{fileref_from_fd(in_fd)};
+    fileref out_f{fileref_from_fd(out_fd)};
+
+    if (!in_f || !out_f) {
+        return libc_error(EBADF);
+    }
+
+    in_fp = in_f.get();
+    out_fp = out_f.get();
+
+    if (!in_fp->f_dentry) {
+        return libc_error(EBADF);
+    }
+
+    if (!(in_fp->f_flags & FREAD)) {
+        return libc_error(EBADF);
+    }
+
+    if (out_fp->f_type & DTYPE_VNODE) {
+        if (!out_fp->f_dentry) {
+            return libc_error(EBADF);
+        } else if (!(out_fp->f_flags & FWRITE)) {
+            return libc_error(EBADF);
+        }
+    }
+
+    off_t offset;
+
+    if (_offset != nullptr) {
+        offset = *_offset;
+        if (offset < 0) {
+            return libc_error(EINVAL);
+        }
+    } else {
+        /* if _offset is nullptr, we need to read from the present position of in_fd */
+        offset = lseek(in_fd, 0, SEEK_CUR);
+        if (offset < 0) {
+            return -1; /* errno was set by lseek() */
+        }
+    }
+
+    // Constrain count to the extent of the file...
+    struct stat st;
+    if (fstat(in_fd, &st) < 0) {
+        return -1;
+    }
+    if (count == 0 || offset >= st.st_size) {
+        return 0;
+    }
+    // offset < st_size here, so the subtraction is positive; comparing
+    // against the remainder avoids overflowing offset + count.
+    size_t remaining = static_cast<size_t>(st.st_size - offset);
+    if (count > remaining) {
+        count = remaining;
+    }
+
+    // mmap() needs a page-aligned file offset: map from the start of the
+    // page containing offset and skip the in-page remainder when writing.
+    size_t page_skip = offset % mmu::page_size;
+    size_t bytes_to_mmap = count + page_skip;
+    off_t offset_for_mmap = align_down(offset, (off_t)mmu::page_size);
+
+    char *src = static_cast<char *>(mmap(nullptr, bytes_to_mmap, PROT_READ, MAP_SHARED, in_fd, offset_for_mmap));
+
+    if (src == MAP_FAILED) {
+        return -1;
+    }
+
+    auto ret = write(out_fd, src + page_skip, count);
+    int write_errno = errno;
+
+    // Always release the whole mapping, including on write failure. The
+    // call is kept outside assert() so it still happens if assertions are
+    // compiled out.
+    int unmap_rc = munmap(src, bytes_to_mmap);
+    assert(unmap_rc == 0);
+    (void)unmap_rc;
+
+    if (ret < 0) {
+        return libc_error(write_errno);
+    } else if(_offset == nullptr) {
+        lseek(in_fd, ret, SEEK_CUR);
+    } else {
+        *_offset += ret;
+    }
+
+    return ret;
+}
+
+#undef sendfile64
+LFS64(sendfile);
+
+NO_SYS(int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags));
+
+/*
+ * Install newmask as the global umask and return the previous value. The
+ * swap is a single atomic exchange with relaxed ordering.
+ */
+mode_t umask(mode_t newmask)
+{
+    mode_t previous = global_umask.exchange(newmask, std::memory_order_relaxed);
+
+    return previous;
+}
+
+/* Does nothing and returns 0. */
+int
+fs_noop(void)
+{
+ return 0;
+}
+
+/* Stub: invokes WARN_STUBBED() and always fails with -1 and errno = ENOSYS. */
+int chroot(const char *path)
+{
+ WARN_STUBBED();
+ errno = ENOSYS;
+ return -1;
+}
+
+// unpack_bootfs() unpacks a collection of files stored as part of the OSv
+// executable (in memory location "bootfs_start") into the file system,
+// normally the in-memory filesystem ramfs.
+// The files are packed in the executable in an ad-hoc format defined here.
+// Code in scripts/mkbootfs.py packs files into this format.
+// Size in bytes of the name field of a bootfs record.
+#define BOOTFS_PATH_MAX 111
+enum class bootfs_file_type : char { other = 0, symlink = 1 };
+// One record of the bootfs table that starts at bootfs_start. A record
+// whose name is empty terminates the table (see the loop in unpack_bootfs()).
+struct bootfs_metadata {
+ // Length of the file's content in bytes.
+ uint64_t size;
+ // Byte offset of the file's content, relative to bootfs_start.
+ uint64_t offset;
+ // The file's type. Can be "symlink" or "other". A directory is an "other"
+ // file with its name ending with a "/" (and no content).
+ bootfs_file_type type;
+ // name must end with a null. For symlink files, the content must end
+ // with a null as well.
+ char name[BOOTFS_PATH_MAX];
+};
+
+extern char bootfs_start;
+
+int ramfs_set_file_data(struct vnode *vp, const void *data, size_t size);
+/*
+ * Walk the bootfs table at bootfs_start and create every entry in the
+ * mounted file system: ancestor directories are created with mkdir(),
+ * symlink records with symlink(), and regular files with creat() followed
+ * by ramfs_set_file_data() on the content embedded in the image. Any
+ * failure other than mkdir() is fatal (sys_panic()).
+ */
+void unpack_bootfs(void)
+{
+    struct bootfs_metadata *md = (struct bootfs_metadata *)&bootfs_start;
+
+    // The table ends at the first record with an empty name.
+    for (int i = 0; md[i].name[0]; i++) {
+        const char *name = md[i].name;
+        const char *content = &bootfs_start + md[i].offset;
+
+        // mkdir() directories needed for this path name, as necessary:
+        // cut the copy at each '/', create that prefix, restore the '/'.
+        char dirbuf[BOOTFS_PATH_MAX];
+        strlcpy(dirbuf, name, BOOTFS_PATH_MAX);
+        char *cur = dirbuf;
+        while (*cur) {
+            if (*cur == '/') {
+                *cur = '\0';
+                mkdir(dirbuf, 0666);  // silently ignore errors and existing dirs
+                *cur = '/';
+            }
+            ++cur;
+        }
+
+        if (md[i].type == bootfs_file_type::symlink) {
+            // This is a symbolic link record. The file's content is the
+            // target path, and we assume ends with a null.
+            if (symlink(content, name) != 0) {
+                kprintf("couldn't symlink %s: %d\n", name, errno);
+                sys_panic("unpack_bootfs failed");
+            }
+            continue;
+        }
+
+        // cur now points at the terminating NUL of dirbuf, so cur[-1] is
+        // the last character of the name.
+        if (cur[-1] == '/' && md[i].size == 0) {
+            // This is directory record. Nothing else to do
+            continue;
+        }
+
+        int fd = creat(name, 0666);
+        if (fd < 0) {
+            kprintf("couldn't create %s: %d\n",
+                    name, errno);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        struct file *fp;
+        int error = fget(fd, &fp);
+        if (error) {
+            kprintf("couldn't fget %s: %d\n",
+                    name, error);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        struct vnode *vp = fp->f_dentry->d_vnode;
+        int ret = ramfs_set_file_data(vp, content, md[i].size);
+        if (ret) {
+            kprintf("ramfs_set_file_data failed, ret = %d\n", ret);
+            sys_panic("unpack_bootfs failed");
+        }
+
+        fdrop(fp);
+        close(fd);
+    }
+}
+
+/*
+ * Mount a ramfs at "/", create /dev and mount devfs on it. Each failing
+ * step is only logged; the remaining steps are still attempted.
+ */
+void mount_rootfs(void)
+{
+    int rc = sys_mount("", "/", "ramfs", 0, nullptr);
+    if (rc != 0) {
+        kprintf("failed to mount rootfs, error = %s\n", strerror(rc));
+    }
+
+    if (mkdir("/dev", 0755) < 0) {
+        kprintf("failed to create /dev, error = %s\n", strerror(errno));
+    }
+
+    rc = sys_mount("", "/dev", "devfs", 0, nullptr);
+    if (rc != 0) {
+        kprintf("failed to mount devfs, error = %s\n", strerror(rc));
+    }
+}
+
+/*
+ * Mount a file system described by name/value pairs.
+ *
+ * iov:   array of niov entries holding alternating NUL-terminated names
+ *        and their values. The names "fstype", "fspath" and "from" are
+ *        recognised; any other pair is ignored.
+ * niov:  number of entries in iov. A trailing name without a value is
+ *        ignored instead of reading past the end of the array.
+ * flags: passed through to sys_mount().
+ *
+ * Returns the result of sys_mount().
+ */
+extern "C"
+int nmount(struct iovec *iov, unsigned niov, int flags)
+{
+    struct args {
+        char* fstype = nullptr;
+        char* fspath = nullptr;
+        char* from = nullptr;
+    };
+    static const unordered_map<string, char* args::*> argmap {
+        { "fstype", &args::fstype },
+        { "fspath", &args::fspath },
+        { "from", &args::from },
+    };
+    args a;
+    // Only visit complete (name, value) pairs: i + 1 must be a valid index.
+    for (size_t i = 0; i + 1 < niov; i += 2) {
+        std::string s(static_cast<const char*>(iov[i].iov_base));
+        auto it = argmap.find(s);
+        if (it != argmap.end()) {
+            a.*(it->second) = static_cast<char*>(iov[i+1].iov_base);
+        }
+    }
+    return sys_mount(a.from, a.fspath, a.fstype, flags, nullptr);
+}
+
+/*
+ * Run "zpool import -f -a" through zpool.so. Nothing is done if
+ * /etc/mnttab does not exist or zpool.so is not accessible.
+ */
+static void import_extra_zfs_pools(void)
+{
+    struct stat st;
+
+    // The file '/etc/mnttab' is a LibZFS requirement and will not
+    // exist during cpiod phase. The functionality provided by this
+    // function isn't needed during that phase, so let's skip it.
+    //
+    // Import extra pools mounting datasets there contained.
+    // Datasets from osv pool will not be mounted here.
+    if (stat("/etc/mnttab" , &st) != 0 || access("zpool.so", X_OK) != 0) {
+        return;
+    }
+
+    int exit_code;
+    vector<string> zpool_args = {"zpool", "import", "-f", "-a" };
+    auto ok = osv::run("zpool.so", zpool_args, &exit_code);
+    assert(ok);
+
+    if (exit_code == 0) {
+        debug("zfs: extra ZFS pool(s) found.\n");
+    }
+}
+
+/*
+ * Make path the new root with sys_pivot_root(), then mount every entry of
+ * /etc/fstab except "/". Failures are logged and do not stop processing;
+ * a missing /etc/fstab simply ends the function.
+ */
+void pivot_rootfs(const char* path)
+{
+    int rc = sys_pivot_root(path, "/");
+    if (rc != 0) {
+        kprintf("failed to pivot root, error = %s\n", strerror(rc));
+    }
+
+    auto fstab = setmntent("/etc/fstab", "r");
+    if (!fstab) {
+        return;
+    }
+
+    for (struct mntent *m = getmntent(fstab); m != nullptr; m = getmntent(fstab)) {
+        if (strcmp(m->mnt_dir, "/") == 0) {
+            continue;
+        }
+
+        const bool has_custom_opts =
+            (m->mnt_opts != nullptr) && strcmp(m->mnt_opts, MNTOPT_DEFAULTS);
+        if (has_custom_opts) {
+            printf("Warning: opts %s, ignored for fs %s\n", m->mnt_opts, m->mnt_type);
+        }
+
+        // FIXME: Right now, ignoring mntops. In the future we may have an option parser
+        rc = sys_mount(m->mnt_fsname, m->mnt_dir, m->mnt_type, 0, nullptr);
+        if (rc != 0) {
+            printf("failed to mount %s, error = %s\n", m->mnt_type, strerror(rc));
+        }
+    }
+    endmntent(fstab);
+}
+
+/* Unmount /dev; a failure is logged and otherwise ignored. */
+extern "C" void unmount_devfs()
+{
+    int rc = sys_umount("/dev");
+
+    if (rc != 0) {
+        kprintf("failed to unmount /dev, error = %s\n", strerror(rc));
+    }
+}
+
+/*
+ * Create /rofs and mount /dev/vblk0.1 there read-only as "rofs".
+ *
+ * pivot_root: when true, switch the root to /rofs after mounting.
+ *
+ * Returns 0 on success. If the mount fails, /rofs is removed again and
+ * the error code from sys_mount() is returned.
+ */
+extern "C" int mount_rofs_rootfs(bool pivot_root)
+{
+    if (mkdir("/rofs", 0755) < 0) {
+        kprintf("failed to create /rofs, error = %s\n", strerror(errno));
+    }
+
+    int rc = sys_mount("/dev/vblk0.1", "/rofs", "rofs", MNT_RDONLY, 0);
+    if (rc != 0) {
+        kprintf("failed to mount /rofs, error = %s\n", strerror(rc));
+        rmdir("/rofs");
+        return rc;
+    }
+
+    if (pivot_root) {
+        pivot_rootfs("/rofs");
+    }
+
+    return 0;
+}
+
+/*
+ * Create /zfs and mount /dev/vblk0.1 there as "zfs" with the data string
+ * "osv/zfs". Failures are only logged.
+ *
+ * pivot_root: when true, switch the root to /zfs and then import any
+ *             extra ZFS pools.
+ */
+extern "C" void mount_zfs_rootfs(bool pivot_root)
+{
+    if (mkdir("/zfs", 0755) < 0) {
+        kprintf("failed to create /zfs, error = %s\n", strerror(errno));
+    }
+
+    int rc = sys_mount("/dev/vblk0.1", "/zfs", "zfs", 0, (void *)"osv/zfs");
+    if (rc != 0) {
+        kprintf("failed to mount /zfs, error = %s\n", strerror(rc));
+    }
+
+    if (pivot_root) {
+        pivot_rootfs("/zfs");
+        import_extra_zfs_pools();
+    }
+}
+
+/*
+ * Unmount /dev (result ignored), then /proc, then force-unmount "/".
+ * Failures of the last two steps are logged as warnings.
+ */
+extern "C" void unmount_rootfs(void)
+{
+    sys_umount("/dev");
+
+    int rc = sys_umount("/proc");
+    if (rc != 0) {
+        kprintf("Warning: unmount_rootfs: failed to unmount /proc, "
+            "error = %s\n", strerror(rc));
+    }
+
+    rc = sys_umount2("/", MNT_FORCE);
+    if (rc != 0) {
+        kprintf("Warning: unmount_rootfs: failed to unmount /, "
+            "error = %s\n", strerror(rc));
+    }
+}
+
+extern "C" void bio_init(void);
+extern "C" void bio_sync(void);
+
+int vfs_initialized;
+
+/*
+ * Bring up the VFS layer: initialise the bio, lookup and vnode subsystems,
+ * allocate main_task, run the init hook of every registered file system,
+ * mount the root file systems, unpack the boot file system, and finally
+ * set up the console descriptors. The steps run in this fixed order;
+ * console failures are only logged.
+ */
+extern "C"
+void
+vfs_init(void)
+{
+ const struct vfssw *fs;
+
+ bio_init();
+ lookup_init();
+ vnode_init();
+ task_alloc(&main_task);
+
+ /*
+ * Initialize each file system.
+ */
+ // The vfssw table is terminated by an entry with a null vs_name.
+ for (fs = vfssw; fs->vs_name; fs++) {
+ if (fs->vs_init) {
+ DPRINTF(VFSDB_CORE, ("VFS: initializing %s\n",
+ fs->vs_name));
+ fs->vs_init();
+ }
+ }
+
+ mount_rootfs();
+ unpack_bootfs();
+
+ // if (open("/dev/console", O_RDWR, 0) != 0)
+ // NOTE(review): presumably console::open() is expected to yield descriptor
+ // 0, which is then duplicated onto descriptors 1 and 2 - confirm.
+ if (console::open() != 0)
+ kprintf("failed to open console, error = %d\n", errno);
+ if (dup(0) != 1)
+ kprintf("failed to dup console (1)\n");
+ if (dup(0) != 2)
+ kprintf("failed to dup console (2)\n");
+ vfs_initialized = 1;
+}
+
+/*
+ * Shut the VFS layer down: drop the working directory held by main_task,
+ * unmount the file systems and sync the bio layer.
+ */
+void vfs_exit(void)
+{
+    /* Callback handed to replace_cwd(); it does nothing and returns 0. */
+    auto nothing_to_do = []() { return 0; };
+
+    /* Free up main_task (stores cwd data) resources */
+    replace_cwd(main_task, nullptr, nothing_to_do);
+    /* Unmount all file systems */
+    unmount_rootfs();
+    /* Finish with the bio layer */
+    bio_sync();
+}
+
+/* Terminate through abort(), passing str as the "panic: %s" message. */
+void sys_panic(const char *str)
+{
+ abort("panic: %s", str);
+}
+
diff --git a/lib/vfscore/mount.c b/lib/vfscore/mount.c
new file mode 100644
index 00000000..dac4d09c
--- /dev/null
+++ b/lib/vfscore/mount.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_mount.c - mount operations
+ */
+
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <dirent.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/device.h>
+#include <osv/debug.h>
+#include <osv/mutex.h>
+#include "vfs.h"
+
+#include <memory>
+#include <list>
+
+/*
+ * List for VFS mount points.
+ */
+static std::list<mount*> mount_list;
+
+/*
+ * Global lock to access mount point.
+ */
+static mutex mount_lock;
+
+/*
+ * Lookup file system.
+ *
+ * Scans the vfssw table for an entry whose name matches name in its first
+ * FSMAXNAMES characters. Returns that entry, or nullptr if there is none.
+ */
+static const struct vfssw *
+fs_getfs(const char *name)
+{
+    for (const struct vfssw *ent = vfssw; ent->vs_name; ent++) {
+        if (strncmp(name, ent->vs_name, FSMAXNAMES) == 0)
+            return ent;
+    }
+
+    return nullptr;
+}
+
+/*
+ * Return the name of the file system whose operations table is ops.
+ * Calls abort() if no entry of vfssw uses ops.
+ */
+const char*
+fs_getfsname(vfsops* ops)
+{
+    auto fs = vfssw;
+
+    while (fs->vs_name) {
+        if (fs->vs_op == ops) {
+            return fs->vs_name;
+        }
+        fs++;
+    }
+    abort();
+}
+
+/*
+ * Mount a file system.
+ *
+ * @dev:    device path, or nullptr. Only paths starting with "/dev/" are
+ *          opened with device_open(); the string is also handed to the
+ *          file system's mount routine.
+ * @dir:    mount point path; must be non-null and non-empty.
+ * @fsname: file system name, resolved with fs_getfs().
+ * @flags:  stored in the mount entry and passed to VFS_MOUNT.
+ * @data:   passed through to VFS_MOUNT.
+ *
+ * Returns 0 on success, otherwise an errno value (ENOENT, ENODEV, EBUSY,
+ * ENOMEM, ENOTDIR or whatever VFS_MOUNT reports).
+ */
+int
+sys_mount(const char *dev, const char *dir, const char *fsname, int flags,
+          const void *data)
+{
+    const struct vfssw *fs;
+    struct mount *mp;
+    struct device *device;
+    struct dentry *dp_covered;
+    struct vnode *vp;
+    int error;
+
+    /* Validate dir first so a null pointer never reaches the "%s" below. */
+    if (!dir || *dir == '\0')
+        return ENOENT;
+
+    kprintf("VFS: mounting %s at %s\n", fsname, dir);
+
+    /* Find a file system. */
+    if (!(fs = fs_getfs(fsname)))
+        return ENODEV;  /* No such file system */
+
+    /* Open device. nullptr can be specified as a device. */
+    // Allow device_open() to fail, in which case dev is interpreted
+    // by the file system mount routine (e.g zfs pools)
+    device = nullptr;
+    if (dev && strncmp(dev, "/dev/", 5) == 0)
+        device_open(dev + 5, DO_RDWR, &device);
+
+    /* Check if device or directory has already been mounted. */
+    // We need to avoid the situation where after we already verified that
+    // the mount point is free, but before we actually add it to mount_list,
+    // another concurrent mount adds it. So we use a new mutex to ensure
+    // that only one sys_mount() runs at a time. We cannot reuse the existing
+    // mount_lock for this purpose: If we take mount_lock and then do
+    // lookups, this is lock order inversion and can result in deadlock.
+    static mutex sys_mount_lock;
+    SCOPE_LOCK(sys_mount_lock);
+    WITH_LOCK(mount_lock) {
+        for (auto&& mp : mount_list) {
+            if (!strcmp(mp->m_path, dir) ||
+                (device && mp->m_dev == device)) {
+                error = EBUSY;  /* Already mounted */
+                goto err1;
+            }
+        }
+    }
+    /*
+     * Create VFS mount entry.
+     */
+    if (!(mp = new mount)) {
+        error = ENOMEM;
+        goto err1;
+    }
+    mp->m_count = 0;
+    mp->m_op = fs->vs_op;
+    mp->m_flags = flags;
+    mp->m_dev = device;
+    mp->m_data = nullptr;
+    strlcpy(mp->m_path, dir, sizeof(mp->m_path));
+    /* dev may be nullptr; record an empty special name in that case. */
+    mp->m_special[0] = '\0';
+    if (dev)
+        strlcpy(mp->m_special, dev, sizeof(mp->m_special));
+
+    /*
+     * Get vnode to be covered in the upper file system.
+     */
+    if (*dir == '/' && *(dir + 1) == '\0') {
+        /* Ignore if it mounts to global root directory. */
+        dp_covered = nullptr;
+    } else {
+        if ((error = namei(dir, &dp_covered)) != 0) {
+            /* Any lookup failure of the mount point is reported as ENOENT. */
+            error = ENOENT;
+            goto err2;
+        }
+        if (dp_covered->d_vnode->v_type != VDIR) {
+            error = ENOTDIR;
+            goto err3;
+        }
+    }
+    mp->m_covered = dp_covered;
+
+    /*
+     * Create a root vnode for this file system.
+     */
+    vget(mp, 0, &vp);
+    if (vp == nullptr) {
+        error = ENOMEM;
+        goto err3;
+    }
+    vp->v_type = VDIR;
+    vp->v_flags = VROOT;
+    vp->v_mode = S_IFDIR | S_IRUSR | S_IWUSR | S_IXUSR;
+
+    mp->m_root = dentry_alloc(nullptr, vp, "/");
+    if (!mp->m_root) {
+        /* Without this the function would return a stale error value. */
+        error = ENOMEM;
+        vput(vp);
+        goto err3;
+    }
+    vput(vp);
+
+    /*
+     * Call a file system specific routine.
+     */
+    if ((error = VFS_MOUNT(mp, dev, flags, data)) != 0)
+        goto err4;
+
+    if (mp->m_flags & MNT_RDONLY)
+        vp->v_mode &=~S_IWUSR;
+
+    /*
+     * Insert to mount list
+     */
+    WITH_LOCK(mount_lock) {
+        mount_list.push_back(mp);
+    }
+
+    return 0;   /* success */
+err4:
+    drele(mp->m_root);
+err3:
+    if (dp_covered)
+        drele(dp_covered);
+err2:
+    delete mp;
+err1:
+    if (device)
+        device_close(device);
+
+    return error;
+}
+
+/*
+ * Drop the dentry references recorded in a mount entry: the covered
+ * dentry (m_covered) when it is set, then the root dentry (m_root).
+ */
+void
+release_mp_dentries(struct mount *mp)
+{
+    struct dentry *covered = mp->m_covered;
+
+    if (covered != nullptr)
+        drele(covered);
+
+    drele(mp->m_root);
+}
+
+/*
+ * Unmount the file system whose mount path equals path.
+ *
+ * Returns ENAMETOOLONG when path is MAXPATHLEN or longer, EINVAL when no
+ * mount entry matches or when the entry has no covered dentry and
+ * MNT_FORCE is not set, otherwise the result of VFS_UNMOUNT. On success
+ * the entry is removed from mount_list, its device closed and it is freed.
+ * mount_lock is held for the whole call.
+ */
+int
+sys_umount2(const char *path, int flags)
+{
+    struct mount *mp = nullptr;
+    int error, pathlen;
+
+    kprintf("VFS: unmounting %s\n", path);
+
+    SCOPE_LOCK(mount_lock);
+
+    pathlen = strlen(path);
+    if (pathlen >= MAXPATHLEN)
+        return ENAMETOOLONG;
+
+    /* Get mount entry */
+    for (auto&& candidate : mount_list) {
+        if (strcmp(path, candidate->m_path) == 0) {
+            mp = candidate;
+            break;
+        }
+    }
+    if (mp == nullptr)
+        return EINVAL;
+
+    /* Root fs can not be unmounted unless forced. */
+    if (mp->m_covered == nullptr && !(flags & MNT_FORCE))
+        return EINVAL;
+
+    error = VFS_UNMOUNT(mp, flags);
+    if (error != 0)
+        return error;
+
+    mount_list.remove(mp);
+
+#ifdef HAVE_BUFFERS
+    /* Flush all buffers */
+    binval(mp->m_dev);
+#endif
+
+    if (mp->m_dev)
+        device_close(mp->m_dev);
+    delete mp;
+    return error;
+}
+
+/* Unmount path with no flags; forwards to sys_umount2(). */
+int
+sys_umount(const char *path)
+{
+    const int no_flags = 0;
+
+    return sys_umount2(path, no_flags);
+}
+
+/*
+ * Make the mount at new_root the root mount and unmount the one at put_old.
+ *
+ * Returns EINVAL when either path is not a mount path or both name the
+ * same mount, EBUSY when another mount's path starts with put_old, or the
+ * error from VFS_UNMOUNT. On success the new mount's covered dentry and
+ * its root dentry's parent are released and its path becomes "/".
+ */
+int
+sys_pivot_root(const char *new_root, const char *put_old)
+{
+    struct mount *newmp = nullptr;
+    struct mount *oldmp = nullptr;
+    int error;
+
+    WITH_LOCK(mount_lock) {
+        /* Locate both mount entries in one pass. */
+        for (auto&& mp : mount_list) {
+            if (strcmp(mp->m_path, new_root) == 0)
+                newmp = mp;
+            if (strcmp(mp->m_path, put_old) == 0)
+                oldmp = mp;
+        }
+        if (newmp == nullptr || oldmp == nullptr || newmp == oldmp)
+            return EINVAL;
+
+        /* Refuse while any third mount lives under put_old (prefix match). */
+        for (auto&& mp : mount_list) {
+            if (mp != newmp && mp != oldmp &&
+                strncmp(mp->m_path, put_old, strlen(put_old)) == 0)
+                return EBUSY;
+        }
+
+        error = VFS_UNMOUNT(oldmp, 0);
+        if (error != 0)
+            return error;
+        mount_list.remove(oldmp);
+
+        newmp->m_root->d_vnode->v_mount = newmp;
+
+        if (newmp->m_covered)
+            drele(newmp->m_covered);
+        newmp->m_covered = nullptr;
+
+        if (newmp->m_root->d_parent)
+            drele(newmp->m_root->d_parent);
+        newmp->m_root->d_parent = nullptr;
+
+        strlcpy(newmp->m_path, "/", sizeof(newmp->m_path));
+    }
+    return 0;
+}
+
+/*
+ * Invoke VFS_SYNC on every entry of mount_list while holding mount_lock,
+ * then (with HAVE_BUFFERS) call bio_sync(). Always returns 0.
+ */
+int
+sys_sync(void)
+{
+    WITH_LOCK(mount_lock) {
+        for (auto it = mount_list.begin(); it != mount_list.end(); ++it) {
+            struct mount *mp = *it;
+            VFS_SYNC(mp);
+        }
+    }
+#ifdef HAVE_BUFFERS
+    bio_sync();
+#endif
+    return 0;
+}
+
+/*
+ * Compare a path against a mount point path. Return matched length.
+ * @path:       target path.
+ * @mount_root: vfs root path as mount point.
+ *
+ * The result is 0 unless all of mount_root is a prefix of path. A
+ * one-character match on "/" yields 1. Otherwise the prefix only counts
+ * when it ends at the end of path or right before a '/'.
+ */
+static size_t
+count_match(const char *path, char *mount_root)
+{
+    size_t i = 0;
+
+    /* Advance over the common prefix. */
+    while (path[i] != '\0' && mount_root[i] != '\0' &&
+           path[i] == mount_root[i])
+        i++;
+
+    /* mount_root was not fully consumed: no match. */
+    if (mount_root[i] != '\0')
+        return 0;
+
+    /* mount_root is exactly "/". */
+    if (i == 1 && path[0] == '/')
+        return 1;
+
+    /* The match must stop on a path component boundary. */
+    if (path[i] == '\0' || path[i] == '/')
+        return i;
+    return 0;
+}
+
+/*
+ * Get the mount point and the mount-relative remainder for a path.
+ * @path: full path.
+ * @mp:   receives the mount entry with the longest count_match() result.
+ * @root: receives a pointer into path just past the matched prefix,
+ *        with one leading '/' skipped.
+ *
+ * Returns 0 on success, -1 when path is null or no mount entry matches.
+ */
+int
+vfs_findroot(const char *path, struct mount **mp, char **root)
+{
+    if (path == nullptr)
+        return -1;
+
+    struct mount *best = nullptr;
+    size_t best_len = 0;
+
+    /* Find mount point from nearest path */
+    SCOPE_LOCK(mount_lock);
+    for (auto&& candidate : mount_list) {
+        size_t matched = count_match(path, candidate->m_path);
+        if (matched > best_len) {
+            best_len = matched;
+            best = candidate;
+        }
+    }
+    if (best == nullptr)
+        return -1;
+
+    char *rest = (char *)(path + best_len);
+    if (*rest == '/')
+        rest++;
+    *root = rest;
+    *mp = best;
+    return 0;
+}
+
+/*
+ * Mark a mount point as busy: increment m_count under mount_lock.
+ */
+void
+vfs_busy(struct mount *mp)
+{
+    SCOPE_LOCK(mount_lock);
+    mp->m_count += 1;
+}
+
+
+/*
+ * Release a busy mark on a mount point: decrement m_count under mount_lock.
+ */
+void
+vfs_unbusy(struct mount *mp)
+{
+    SCOPE_LOCK(mount_lock);
+    mp->m_count -= 1;
+}
+
+/* Operation stub that does nothing and returns 0. */
+int
+vfs_nullop(void)
+{
+    const int ok = 0;
+
+    return ok;
+}
+
+/* Operation stub that does nothing and returns EINVAL. */
+int
+vfs_einval(void)
+{
+    const int rejected = EINVAL;
+
+    return rejected;
+}
+
+namespace osv {
+
+/*
+ * Build a mount_desc from a mount entry: special and path are copied from
+ * the entry, type comes from fs_getfsname(), options is left empty.
+ */
+mount_desc to_mount_desc(mount* m)
+{
+    mount_desc desc;
+
+    desc.special = m->m_special;
+    desc.path = m->m_path;
+    desc.type = fs_getfsname(m->m_op);
+    // FIXME: record options
+    desc.options = "";
+    return desc;
+}
+
+/*
+ * Return one mount_desc per entry of mount_list, in list order.
+ * The list is walked with mount_lock held.
+ */
+std::vector<mount_desc>
+current_mounts()
+{
+    std::vector<mount_desc> mounts;
+
+    WITH_LOCK(mount_lock) {
+        for (auto it = mount_list.begin(); it != mount_list.end(); ++it)
+            mounts.push_back(to_mount_desc(*it));
+    }
+    return mounts;
+}
+
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Print a table with one line per mount_list entry (m_dev, m_count,
+ * m_path) through kprintf. Only built when DEBUG_VFS is defined.
+ */
+void
+mount_dump(void)
+{
+    SCOPE_LOCK(mount_lock);
+
+    kprintf("mount_dump\n");
+    kprintf("dev count root\n");
+    kprintf("-------- ----- --------\n");
+
+    for (auto it = mount_list.begin(); it != mount_list.end(); ++it) {
+        struct mount *mp = *it;
+
+        kprintf("%8x %5d %s\n", mp->m_dev, mp->m_count, mp->m_path);
+    }
+}
+#endif
diff --git a/lib/vfscore/subr_uio.c b/lib/vfscore/subr_uio.c
new file mode 100644
index 00000000..bf138b8e
--- /dev/null
+++ b/lib/vfscore/subr_uio.c
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <osv/uio.h>
+
+/*
+ * Copy up to n bytes between the flat buffer cp and the iovec array
+ * described by uio.
+ *
+ * For UIO_READ the bytes go from cp into the iovecs; for UIO_WRITE they go
+ * from the iovecs into cp. Each consumed iovec has its iov_base advanced
+ * and iov_len reduced, and uio_resid/uio_offset are updated by the number
+ * of bytes moved. Copying stops when n bytes were moved or uio_resid
+ * reaches zero. Always returns 0.
+ */
+int
+uiomove(void *cp, int n, struct uio *uio)
+{
+    assert(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
+
+    while (n > 0 && uio->uio_resid) {
+        struct iovec *iov = uio->uio_iov;
+        /*
+         * Keep the chunk size in size_t: storing iov_len in an int would
+         * truncate lengths above INT_MAX and could hand memcpy a bogus size.
+         */
+        size_t cnt = iov->iov_len;
+
+        if (cnt == 0) {
+            /* This iovec is exhausted; move on to the next one. */
+            uio->uio_iov++;
+            uio->uio_iovcnt--;
+            continue;
+        }
+        /* n > 0 here, so the cast cannot change its value. */
+        if (cnt > (size_t)n)
+            cnt = (size_t)n;
+
+        if (uio->uio_rw == UIO_READ)
+            memcpy(iov->iov_base, cp, cnt);
+        else
+            memcpy(cp, iov->iov_base, cnt);
+
+        iov->iov_base = (char *)iov->iov_base + cnt;
+        iov->iov_len -= cnt;
+        uio->uio_resid -= cnt;
+        uio->uio_offset += cnt;
+        cp = (char *)cp + cnt;
+        /* cnt <= n after the clamp above, so it fits in an int. */
+        n -= (int)cnt;
+    }
+
+    return 0;
+}
diff --git a/lib/vfscore/syscalls.c b/lib/vfscore/syscalls.c
new file mode 100644
index 00000000..487d5729
--- /dev/null
+++ b/lib/vfscore/syscalls.c
@@ -0,0 +1,1486 @@
+/*
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_syscalls.c - everything in this file is a routine implementing
+ * a VFS system call.
+ */
+
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include <limits.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include <osv/vfs_file.hh>
+#include "vfs.h"
+#include <fs/fs.hh>
+
+extern struct task *main_task;
+
+/*
+ * O_NOFOLLOW check for sys_open(): resolve the last path component without
+ * following it and return ELOOP when it is a symbolic link (VLNK).
+ * Returns 0 when it is not a link, or the lookup error otherwise.
+ */
+static int
+open_no_follow_chk(char *path)
+{
+    struct dentry *parent = nullptr;
+    struct dentry *target = nullptr;
+    char *name;
+    int error;
+
+    error = lookup(path, &parent, &name);
+    if (error)
+        return (error);
+
+    error = namei_last_nofollow(path, parent, &target);
+    if (error == 0) {
+        struct vnode *vp = target->d_vnode;
+
+        vn_lock(vp);
+        if (vp->v_type == VLNK)
+            error = ELOOP;
+        vn_unlock(vp);
+    }
+
+    if (target != nullptr)
+        drele(target);
+    if (parent != nullptr)
+        drele(parent);
+
+    return (error);
+}
+
+/*
+ * Open (and with O_CREAT possibly create) the file at path.
+ *
+ * @path:  path of the file.
+ * @flags: open(2)-style flags; converted with fflags() before use.
+ * @mode:  mode for a newly created file; the type bits are forced to S_IFREG.
+ * @fpp:   receives the new file object on success.
+ *
+ * Returns 0 on success or an errno value.
+ */
+int
+sys_open(char *path, int flags, mode_t mode, struct file **fpp)
+{
+    file *fp;
+    struct dentry *dp, *ddp;
+    struct vnode *vp;
+    char *filename;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_open: path=%s flags=%x mode=%x\n",
+                            path, flags, mode));
+
+    flags = fflags(flags);
+    if (flags & O_CREAT) {
+        error = namei(path, &dp);
+        if (error != 0 && error != ENOENT)
+            return error;
+
+        if (error == ENOENT) {
+            /* The file does not exist yet: create it in its parent. */
+            error = lookup(path, &ddp, &filename);
+            if (error != 0)
+                return error;
+
+            vn_lock(ddp->d_vnode);
+            error = vn_access(ddp->d_vnode, VWRITE);
+            if (error == 0) {
+                mode &= ~S_IFMT;
+                mode |= S_IFREG;
+                error = VOP_CREATE(ddp->d_vnode, filename, mode);
+            }
+            vn_unlock(ddp->d_vnode);
+            drele(ddp);
+            if (error)
+                return error;
+
+            error = namei(path, &dp);
+            if (error != 0)
+                return error;
+
+            /* A file that was just created has nothing to truncate. */
+            flags &= ~O_TRUNC;
+        } else if (flags & O_EXCL) {
+            /* File already exists */
+            error = EEXIST;
+            goto out_drele;
+        }
+
+        vp = dp->d_vnode;
+        flags &= ~O_CREAT;
+    } else {
+        /* Open */
+        if (flags & O_NOFOLLOW) {
+            error = open_no_follow_chk(path);
+            if (error != 0)
+                return (error);
+        }
+        error = namei(path, &dp);
+        if (error)
+            return error;
+
+        vp = dp->d_vnode;
+
+        if ((flags & FWRITE) || (flags & O_TRUNC)) {
+            error = vn_access(vp, VWRITE);
+            if (error)
+                goto out_drele;
+
+            if (vp->v_type == VDIR) {
+                error = EISDIR;
+                goto out_drele;
+            }
+        }
+        if ((flags & O_DIRECTORY) && vp->v_type != VDIR) {
+            error = ENOTDIR;
+            goto out_drele;
+        }
+    }
+
+    vn_lock(vp);
+    /* Process truncate request */
+    if (flags & O_TRUNC) {
+        if (!(flags & FWRITE) || vp->v_type == VDIR) {
+            error = EINVAL;
+            goto out_vn_unlock;
+        }
+
+        error = VOP_TRUNCATE(vp, 0);
+        if (error)
+            goto out_vn_unlock;
+    }
+
+    try {
+        fileref f = make_file<vfs_file>(flags);
+        fp = f.get();
+        fhold(fp);
+    } catch (int err) {
+        error = err;
+        goto out_vn_unlock;
+    }
+    // change to std::move once dp is a dentry_ref
+    fp->f_dentry = dentry_ref(dp, false);
+    dp = nullptr;
+
+    error = VOP_OPEN(vp, fp);
+    vn_unlock(vp);
+    if (error) {
+        // Note direct delete of fp instead of fdrop(fp). fp was never
+        // returned so cannot be in use, and because it wasn't opened
+        // it cannot be close()ed.
+        delete fp;
+        return error;
+    }
+
+    *fpp = fp;
+    return 0;
+
+out_vn_unlock:
+    vn_unlock(vp);
+out_drele:
+    if (dp) {
+        drele(dp);
+    }
+    return error;
+}
+
+/* Close hook for a file object; performs no work here and returns 0. */
+int
+sys_close(struct file *fp)
+{
+    (void)fp;
+
+    return 0;
+}
+
+/*
+ * Read from fp into the niov buffers described by iov.
+ *
+ * @offset: file offset to read at; -1 means the call is made without
+ *          FOF_OFFSET.
+ * @count:  receives the number of bytes transferred.
+ *
+ * Returns EBADF when fp lacks FREAD, EINVAL when the buffer lengths sum
+ * past IOSIZE_MAX, otherwise the result of fp->read().
+ */
+int
+sys_read(struct file *fp, const struct iovec *iov, size_t niov,
+         off_t offset, size_t *count)
+{
+    if (!(fp->f_flags & FREAD))
+        return EBADF;
+
+    size_t total = 0;
+    for (unsigned idx = 0; idx < niov; idx++) {
+        size_t len = iov[idx].iov_len;
+
+        if (len > IOSIZE_MAX - total)
+            return EINVAL;
+        total += len;
+    }
+
+    if (total == 0) {
+        *count = 0;
+        return 0;
+    }
+
+    // Unfortunately, the current implementation of fp->read zeros the
+    // iov_len fields when it reads from disk, so we have to copy iov.
+    std::vector<iovec> scratch(iov, iov + niov);
+
+    struct uio uio;
+    uio.uio_iov = scratch.data();
+    uio.uio_iovcnt = niov;
+    uio.uio_offset = offset;
+    uio.uio_resid = total;
+    uio.uio_rw = UIO_READ;
+
+    int ioflags = (offset == -1) ? 0 : FOF_OFFSET;
+    auto error = fp->read(&uio, ioflags);
+    *count = total - uio.uio_resid;
+    return error;
+}
+
+/*
+ * Write the niov buffers described by iov to fp.
+ *
+ * @offset: file offset to write at; -1 means the call is made without
+ *          FOF_OFFSET.
+ * @count:  receives the number of bytes transferred.
+ *
+ * Returns EBADF when fp lacks FWRITE, EINVAL when the buffer lengths sum
+ * past IOSIZE_MAX, otherwise the result of fp->write().
+ */
+int
+sys_write(struct file *fp, const struct iovec *iov, size_t niov,
+          off_t offset, size_t *count)
+{
+    if (!(fp->f_flags & FWRITE))
+        return EBADF;
+
+    size_t total = 0;
+    for (unsigned idx = 0; idx < niov; idx++) {
+        size_t len = iov[idx].iov_len;
+
+        if (len > IOSIZE_MAX - total)
+            return EINVAL;
+        total += len;
+    }
+
+    if (total == 0) {
+        *count = 0;
+        return 0;
+    }
+
+    // Unfortunately, the current implementation of fp->write zeros the
+    // iov_len fields when it writes to disk, so we have to copy iov.
+    std::vector<iovec> scratch(iov, iov + niov);
+
+    struct uio uio;
+    uio.uio_iov = scratch.data();
+    uio.uio_iovcnt = niov;
+    uio.uio_offset = offset;
+    uio.uio_resid = total;
+    uio.uio_rw = UIO_WRITE;
+
+    int ioflags = (offset == -1) ? 0 : FOF_OFFSET;
+    auto error = fp->write(&uio, ioflags);
+    *count = total - uio.uio_resid;
+    return error;
+}
+
+/*
+ * Reposition the offset of an open file.
+ *
+ * @fp:     file object; must be backed by a dentry.
+ * @off:    offset, interpreted according to type.
+ * @type:   SEEK_SET, SEEK_CUR or SEEK_END.
+ * @origin: receives the resulting absolute offset on success.
+ *
+ * Returns ESPIPE when fp has no dentry, EINVAL when type is not one of the
+ * three supported values or the resulting offset is negative, otherwise
+ * the result of VOP_SEEK.
+ */
+int
+sys_lseek(struct file *fp, off_t off, int type, off_t *origin)
+{
+    struct vnode *vp;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_seek: fp=%x off=%d type=%d\n",
+                            (u_long)fp, (u_int)off, type));
+
+    if (!fp->f_dentry) {
+        // Linux doesn't implement lseek() on pipes, sockets, or ttys.
+        // In OSV, we only implement lseek() on regular files, backed by vnode
+        return ESPIPE;
+    }
+
+    vp = fp->f_dentry->d_vnode;
+    int error = EINVAL;
+    vn_lock(vp);
+    switch (type) {
+    case SEEK_SET:
+        /* off is already absolute */
+        break;
+    case SEEK_CUR:
+        off = fp->f_offset + off;
+        break;
+    case SEEK_END:
+        off = vp->v_size + off;
+        break;
+    default:
+        /* Unknown whence: reject instead of treating it as SEEK_SET. */
+        vn_unlock(vp);
+        return EINVAL;
+    }
+    if (off >= 0) {
+        error = VOP_SEEK(vp, fp, fp->f_offset, off);
+        if (!error) {
+            *origin = off;
+            fp->f_offset = off;
+        }
+    }
+    vn_unlock(vp);
+    return error;
+}
+
+/*
+ * Forward an ioctl request to the file object.
+ * Returns EBADF when fp has neither FREAD nor FWRITE set, otherwise the
+ * result of fp->ioctl().
+ */
+int
+sys_ioctl(struct file *fp, u_long request, void *buf)
+{
+    DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: fp=%x request=%x\n", fp, request));
+
+    if (!(fp->f_flags & (FREAD | FWRITE)))
+        return EBADF;
+
+    int error = fp->ioctl(request, buf);
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_ioctl: comp error=%d\n", error));
+    return error;
+}
+
+/*
+ * Run VOP_FSYNC on the vnode behind fp with the vnode locked.
+ * Returns EINVAL when fp has no dentry.
+ */
+int
+sys_fsync(struct file *fp)
+{
+    DPRINTF(VFSDB_SYSCALL, ("sys_fsync: fp=%x\n", fp));
+
+    if (!fp->f_dentry)
+        return EINVAL;
+
+    struct vnode *vp = fp->f_dentry->d_vnode;
+
+    vn_lock(vp);
+    int error = VOP_FSYNC(vp, fp);
+    vn_unlock(vp);
+    return error;
+}
+
+/* Fill st through fp->stat() and return its result. */
+int
+sys_fstat(struct file *fp, struct stat *st)
+{
+    DPRINTF(VFSDB_SYSCALL, ("sys_fstat: fp=%x\n", fp));
+
+    return fp->stat(st);
+}
+
+/*
+ * Return 0 if directory is empty.
+ *
+ * Opens path and reads entries until one other than "." or ".." shows up
+ * or sys_readdir() fails with something other than EACCES. ENOENT from
+ * sys_readdir() is mapped to 0 and a successful read to ENOTEMPTY; any
+ * other error is returned unchanged.
+ */
+static int
+check_dir_empty(char *path)
+{
+    int error;
+    struct file *fp;
+    struct dirent dir;
+
+    DPRINTF(VFSDB_SYSCALL, ("check_dir_empty\n"));
+
+    error = sys_open(path, O_RDONLY, 0, &fp);
+    if (error)
+        return error;
+
+    for (;;) {
+        error = sys_readdir(fp, &dir);
+        if (error != 0 && error != EACCES)
+            break;
+        /* Keep going only past the "." and ".." entries. */
+        if (strcmp(dir.d_name, ".") != 0 && strcmp(dir.d_name, "..") != 0)
+            break;
+    }
+
+    if (error == ENOENT) {
+        error = 0;
+    } else if (error == 0) {
+        // Posix specifies to return EEXIST in this case (rmdir of non-empty
+        // directory, but Linux actually returns ENOTEMPTY).
+        error = ENOTEMPTY;
+    }
+    fdrop(fp);
+    return error;
+}
+
+/*
+ * Read the next directory entry of fp into dir via VOP_READDIR.
+ * Returns ENOTDIR when fp has no dentry or its vnode is not a directory.
+ */
+int
+sys_readdir(struct file *fp, struct dirent *dir)
+{
+    DPRINTF(VFSDB_SYSCALL, ("sys_readdir: fp=%x\n", fp));
+
+    if (!fp->f_dentry)
+        return ENOTDIR;
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    int error;
+
+    vn_lock(dvp);
+    if (dvp->v_type == VDIR) {
+        error = VOP_READDIR(dvp, fp, dir);
+        DPRINTF(VFSDB_SYSCALL, ("sys_readdir: error=%d path=%s\n",
+                                error, dir->d_name));
+    } else {
+        error = ENOTDIR;
+    }
+    vn_unlock(dvp);
+    return error;
+}
+
+/*
+ * Reset the offset of a directory file object to 0.
+ * Returns ENOTDIR when fp has no dentry and EBADF when its vnode is not a
+ * directory.
+ */
+int
+sys_rewinddir(struct file *fp)
+{
+    if (!fp->f_dentry)
+        return ENOTDIR;
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    int result = EBADF;
+
+    vn_lock(dvp);
+    if (dvp->v_type == VDIR) {
+        fp->f_offset = 0;
+        result = 0;
+    }
+    vn_unlock(dvp);
+    return result;
+}
+
+/*
+ * Set the offset of a directory file object to loc.
+ * Returns ENOTDIR when fp has no dentry and EBADF when its vnode is not a
+ * directory.
+ */
+int
+sys_seekdir(struct file *fp, long loc)
+{
+    if (!fp->f_dentry)
+        return ENOTDIR;
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    int result = EBADF;
+
+    vn_lock(dvp);
+    if (dvp->v_type == VDIR) {
+        fp->f_offset = (off_t)loc;
+        result = 0;
+    }
+    vn_unlock(dvp);
+    return result;
+}
+
+/*
+ * Store the current offset of a directory file object in *loc.
+ * Returns ENOTDIR when fp has no dentry and EBADF when its vnode is not a
+ * directory.
+ */
+int
+sys_telldir(struct file *fp, long *loc)
+{
+    if (!fp->f_dentry)
+        return ENOTDIR;
+
+    struct vnode *dvp = fp->f_dentry->d_vnode;
+    int result = EBADF;
+
+    vn_lock(dvp);
+    if (dvp->v_type == VDIR) {
+        *loc = (long)fp->f_offset;
+        result = 0;
+    }
+    vn_unlock(dvp);
+    return result;
+}
+
+/*
+ * Create a directory at path.
+ *
+ * Returns EEXIST when path already resolves, the lookup() error when the
+ * parent cannot be resolved, the vn_access() error when the parent is not
+ * writable, otherwise the result of VOP_MKDIR. The type bits of mode are
+ * replaced with S_IFDIR.
+ */
+int
+sys_mkdir(char *path, mode_t mode)
+{
+    char *name;
+    struct dentry *dp, *ddp;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_mkdir: path=%s mode=%d\n", path, mode));
+
+    if (namei(path, &dp) == 0) {
+        /* Something already exists at path */
+        drele(dp);
+        return EEXIST;
+    }
+
+    error = lookup(path, &ddp, &name);
+    if (error != 0) {
+        /* The parent directory could not be resolved */
+        return error;
+    }
+
+    vn_lock(ddp->d_vnode);
+    error = vn_access(ddp->d_vnode, VWRITE);
+    if (error == 0) {
+        mode &= ~S_IFMT;
+        mode |= S_IFDIR;
+        error = VOP_MKDIR(ddp->d_vnode, name, mode);
+    }
+    vn_unlock(ddp->d_vnode);
+    drele(ddp);
+    return error;
+}
+
+/*
+ * Remove the directory at path.
+ *
+ * Fails with the check_dir_empty() result when the directory is not empty,
+ * ENOTDIR when path is not a directory, and EBUSY when the vnode is a
+ * mount root or has a reference count of 2 or more. Otherwise returns the
+ * result of VOP_RMDIR.
+ */
+int
+sys_rmdir(char *path)
+{
+    struct dentry *dp, *ddp;
+    struct vnode *vp;
+    int error;
+    char *name;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_rmdir: path=%s\n", path));
+
+    error = check_dir_empty(path);
+    if (error != 0)
+        return error;
+    error = namei(path, &dp);
+    if (error)
+        return error;
+
+    vp = dp->d_vnode;
+    vn_lock(vp);
+
+    error = vn_access(vp, VWRITE);
+    if (error == 0) {
+        if (vp->v_type != VDIR)
+            error = ENOTDIR;
+        else if ((vp->v_flags & VROOT) || vp->v_refcnt >= 2)
+            error = EBUSY;
+        else
+            error = lookup(path, &ddp, &name);
+    }
+    if (error != 0) {
+        vn_unlock(vp);
+        drele(dp);
+        return error;
+    }
+
+    vn_lock(ddp->d_vnode);
+    error = VOP_RMDIR(ddp->d_vnode, vp, name);
+    vn_unlock(ddp->d_vnode);
+
+    vn_unlock(vp);
+    dentry_remove(dp);
+    drele(ddp);
+    drele(dp);
+    return error;
+}
+
+/*
+ * Create a file system node at path.
+ *
+ * Only the types S_IFREG, S_IFDIR, S_IFIFO and S_IFSOCK are accepted;
+ * anything else yields EINVAL. Returns EEXIST when path already resolves.
+ * Directories are created with VOP_MKDIR, everything else with VOP_CREATE.
+ */
+int
+sys_mknod(char *path, mode_t mode)
+{
+    char *name;
+    struct dentry *dp, *ddp;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_mknod: path=%s mode=%d\n", path, mode));
+
+    mode_t fmt = mode & S_IFMT;
+    if (fmt != S_IFREG && fmt != S_IFDIR && fmt != S_IFIFO && fmt != S_IFSOCK)
+        return EINVAL;
+
+    if (namei(path, &dp) == 0) {
+        drele(dp);
+        return EEXIST;
+    }
+
+    error = lookup(path, &ddp, &name);
+    if (error != 0)
+        return error;
+
+    vn_lock(ddp->d_vnode);
+    error = vn_access(ddp->d_vnode, VWRITE);
+    if (error == 0) {
+        if (S_ISDIR(mode))
+            error = VOP_MKDIR(ddp->d_vnode, name, mode);
+        else
+            error = VOP_CREATE(ddp->d_vnode, name, mode);
+    }
+    vn_unlock(ddp->d_vnode);
+    drele(ddp);
+    return error;
+}
+
+/*
+ * Returns true when @parent path could represent parent directory
+ * of a file or directory represented by @child path.
+ *
+ * Assumes both paths do not have trailing slashes (the root path "/" is
+ * the one exception and is handled by the parent[p_len - 1] test).
+ * An empty @parent is never a parent.
+ */
+static bool
+is_parent(const char *parent, const char *child)
+{
+    size_t p_len = strlen(parent);
+
+    /* Guard the parent[p_len - 1] access below against p_len == 0. */
+    if (p_len == 0)
+        return false;
+
+    return !strncmp(parent, child, p_len) &&
+           (parent[p_len - 1] == '/' || child[p_len] == '/');
+}
+
+/* Return true when path is non-empty and its last character is ch. */
+static bool
+has_trailing(const char *path, char ch)
+{
+    size_t len = strlen(path);
+
+    if (len == 0)
+        return false;
+    return path[len - 1] == ch;
+}
+
+/*
+ * Remove every trailing occurrence of ch from path, in place.
+ * A path made up only of ch becomes the empty string.
+ */
+static void
+strip_trailing(char *path, char ch)
+{
+    char *end = path + strlen(path);
+
+    while (end != path && end[-1] == ch)
+        end--;
+
+    *end = '\0';
+}
+
+/*
+ * Rename src to dest.
+ *
+ * Both path buffers are modified in place: trailing slashes are stripped
+ * and dest is split at its last '/'.
+ *
+ * Returns 0 on success (also when src and dest are the same path), or an
+ * errno value: ENOTDIR/EISDIR for type mismatches between source and
+ * target, EEXIST when the target is a non-empty directory, EINVAL when
+ * dest lies inside src, EXDEV when the two parent directories are on
+ * different mounts, or the error from the lookups, vn_access() or
+ * VOP_RENAME.
+ */
+int
+sys_rename(char *src, char *dest)
+{
+    struct dentry *dp1, *dp2 = 0, *ddp1, *ddp2;
+    struct vnode *vp1, *vp2 = 0, *dvp1, *dvp2;
+    char *sname, *dname;
+    int error;
+    char root[] = "/";
+    bool ts; /* trailing slash */
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_rename: src=%s dest=%s\n", src, dest));
+
+    ts = false;
+    if (has_trailing(src, '/') == true) {
+        if (strlen(src) != 1) {
+            /* remove trailing slash iff path is not root */
+            strip_trailing(src, '/');
+            ts = true;
+        }
+    }
+
+    error = lookup(src, &ddp1, &sname);
+    if (error != 0) {
+        return (error);
+    }
+
+    error = namei_last_nofollow(src, ddp1, &dp1);
+    if (error != 0) {
+        drele(ddp1);
+        return (error);
+    }
+
+    vp1 = dp1->d_vnode;
+    vn_lock(vp1);
+
+    /* A trailing slash on the source is only valid for directories. */
+    if (vp1->v_type != VDIR && ts == true) {
+        error = ENOTDIR;
+        goto err1;
+    }
+
+    ts = false;
+    if (has_trailing(dest, '/') == true) {
+        if (strlen(dest) != 1) {
+            /* remove trailing slash iff path is not root */
+            strip_trailing(dest, '/');
+            ts = true;
+        }
+    }
+
+    error = lookup(dest, &ddp2, &dname);
+    if (error != 0) {
+        goto err1;
+    }
+
+    error = namei_last_nofollow(dest, ddp2, &dp2);
+    if (error == 0) {
+        /* target exists */
+
+        vp2 = dp2->d_vnode;
+        vn_lock(vp2);
+
+        if (vp2->v_type != VDIR && vp2->v_type != VLNK) {
+            if (vp1->v_type == VDIR || ts == true) {
+                error = ENOTDIR;
+                goto err2;
+            }
+        } else if (vp1->v_type != VDIR && vp2->v_type == VDIR) {
+            error = EISDIR;
+            goto err2;
+        }
+        if (vp2->v_type == VDIR && check_dir_empty(dest)) {
+            error = EEXIST;
+            goto err2;
+        }
+    } else if (error == ENOENT) {
+        if (vp1->v_type != VDIR && ts == true) {
+            error = ENOTDIR;
+            goto err2;
+        }
+    } else {
+        goto err2;
+    }
+
+    if (strcmp(dest, "/"))
+        strip_trailing(dest, '/');
+
+    if (strcmp(src, "/"))
+        strip_trailing(src, '/');
+
+    /* If source and dest are the same, do nothing */
+    if (!strncmp(src, dest, PATH_MAX))
+        goto err2;
+
+    /* Check if target is directory of source */
+    if (is_parent(src, dest)) {
+        error = EINVAL;
+        goto err2;
+    }
+
+    /* Split dest into its directory part and its final component. */
+    dname = strrchr(dest, '/');
+    if (dname == nullptr) {
+        error = ENOTDIR;
+        goto err2;
+    }
+    if (dname == dest)
+        dest = root;
+
+    *dname = 0;
+    dname++;
+
+    dvp1 = ddp1->d_vnode;
+    vn_lock(dvp1);
+
+    dvp2 = ddp2->d_vnode;
+    vn_lock(dvp2);
+
+    /* Source and destination directories should be writable */
+    if ((error = vn_access(dvp1, VWRITE)) != 0)
+        goto err3;
+    if ((error = vn_access(dvp2, VWRITE)) != 0)
+        goto err3;
+
+    /* The source and dest must be same file system */
+    if (dvp1->v_mount != dvp2->v_mount) {
+        error = EXDEV;
+        goto err3;
+    }
+
+    error = VOP_RENAME(dvp1, vp1, sname, dvp2, vp2, dname);
+
+    /*
+     * Only update the dentry cache when the file system actually performed
+     * the rename; otherwise the cache would describe a move that never
+     * happened.
+     */
+    if (error == 0) {
+        dentry_move(dp1, ddp2, dname);
+        if (dp2)
+            dentry_remove(dp2);
+    }
+
+err3:
+    vn_unlock(dvp2);
+    vn_unlock(dvp1);
+err2:
+    if (vp2) {
+        vn_unlock(vp2);
+        drele(dp2);
+    }
+    drele(ddp2);
+err1:
+    vn_unlock(vp1);
+    drele(dp1);
+    drele(ddp1);
+    return error;
+}
+
+/*
+ * Create a symbolic link at newpath whose target is oldpath.
+ *
+ * Returns EFAULT when either argument is null, the task_conv() error for
+ * newpath, ENOENT when the parent directory of newpath cannot be
+ * resolved, EEXIST when newpath already exists, ENAMETOOLONG when oldpath
+ * does not fit in a PATH_MAX buffer, the vn_access() error when the
+ * parent is not writable, otherwise the result of VOP_SYMLINK.
+ */
+int
+sys_symlink(const char *oldpath, const char *newpath)
+{
+    struct task *t = main_task;
+    int error;
+    std::unique_ptr<char []> up_op (new char[PATH_MAX]);
+    char *op = up_op.get();
+    std::unique_ptr<char []> up_np (new char[PATH_MAX]);
+    char *np = up_np.get();
+    struct dentry *newdp;
+    struct dentry *newdirdp;
+    char *name;
+
+    if (oldpath == nullptr || newpath == nullptr) {
+        return (EFAULT);
+    }
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_symlink: oldpath=%s newpath=%s\n",
+                            oldpath, newpath));
+
+    newdp = nullptr;
+    newdirdp = nullptr;
+
+    error = task_conv(t, newpath, VWRITE, np);
+    if (error != 0) {
+        return (error);
+    }
+
+    /* parent directory for new path must exist */
+    if ((error = lookup(np, &newdirdp, &name)) != 0) {
+        error = ENOENT;
+        goto out;
+    }
+    vn_lock(newdirdp->d_vnode);
+
+    /* newpath should not already exist */
+    if (namei_last_nofollow(np, newdirdp, &newdp) == 0) {
+        drele(newdp);
+        error = EEXIST;
+        goto out;
+    }
+
+    /* check for write access at newpath */
+    if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0) {
+        goto out;
+    }
+
+    /* oldpath may not be const char * to VOP_SYMLINK - need to copy */
+    size_t tocopy;
+    tocopy = strlcpy(op, oldpath, PATH_MAX);
+    /*
+     * strlcpy() returns strlen(oldpath); the copy was truncated only when
+     * that length is PATH_MAX or more.
+     */
+    if (tocopy >= (size_t)PATH_MAX) {
+        error = ENAMETOOLONG;
+        goto out;
+    }
+    error = VOP_SYMLINK(newdirdp->d_vnode, name, op);
+
+out:
+    if (newdirdp != nullptr) {
+        vn_unlock(newdirdp->d_vnode);
+        drele(newdirdp);
+    }
+
+    return (error);
+}
+
+/*
+ * Create a hard link newpath that refers to the same vnode as oldpath.
+ *
+ * Returns the namei() error when oldpath does not resolve, EPERM when
+ * oldpath is a directory, EEXIST when newpath already exists, EXDEV when
+ * the two paths are on different mounts, ENOMEM when the new dentry
+ * cannot be allocated, the vn_access() error when the target directory is
+ * not writable, otherwise the result of VOP_LINK.
+ */
+int
+sys_link(char *oldpath, char *newpath)
+{
+    struct dentry *olddp, *newdirdp;
+    /*
+     * Start as nullptr: several error paths reach "out" before newdp has
+     * been assigned, and dentry_alloc() may also leave it null.
+     */
+    struct dentry *newdp = nullptr;
+    struct vnode *vp;
+    char *name;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_link: oldpath=%s newpath=%s\n",
+                            oldpath, newpath));
+
+    /* File from oldpath must exist */
+    if ((error = namei(oldpath, &olddp)) != 0)
+        return error;
+
+    vp = olddp->d_vnode;
+    vn_lock(vp);
+
+    if (vp->v_type == VDIR) {
+        error = EPERM;
+        goto out;
+    }
+
+    /* If newpath exists, it shouldn't be overwritten */
+    if (!namei(newpath, &newdp)) {
+        error = EEXIST;
+        goto out;
+    }
+    /* Do not rely on what a failed namei() left in newdp. */
+    newdp = nullptr;
+
+    /* Get pointer to the parent dentry of newpath */
+    if ((error = lookup(newpath, &newdirdp, &name)) != 0)
+        goto out;
+
+    vn_lock(newdirdp->d_vnode);
+
+    /* Both files must reside on the same mounted file system */
+    if (olddp->d_mount != newdirdp->d_mount) {
+        error = EXDEV;
+        goto out1;
+    }
+
+    /* Write access to the dir containing newpath is required */
+    if ((error = vn_access(newdirdp->d_vnode, VWRITE)) != 0)
+        goto out1;
+
+    /* Map newpath into dentry hash with the same vnode as oldpath */
+    if (!(newdp = dentry_alloc(newdirdp, vp, newpath))) {
+        error = ENOMEM;
+        goto out1;
+    }
+
+    error = VOP_LINK(newdirdp->d_vnode, vp, name);
+out1:
+    vn_unlock(newdirdp->d_vnode);
+    drele(newdirdp);
+out:
+    vn_unlock(vp);
+    drele(olddp);
+    if (newdp != nullptr)
+        drele(newdp);
+    return error;
+}
+
+/*
+ * Remove the directory entry at path (the last component is not followed).
+ *
+ * Returns EISDIR when path is a directory, EBUSY when its vnode is a
+ * mount root, the vn_access() error when the parent is not writable,
+ * otherwise the result of VOP_REMOVE.
+ */
+int
+sys_unlink(char *path)
+{
+    char *name;
+    struct dentry *dp = nullptr;
+    struct dentry *ddp = nullptr;
+    struct vnode *vp = nullptr;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_unlink: path=%s\n", path));
+
+    error = lookup(path, &ddp, &name);
+    if (error != 0)
+        return (error);
+
+    error = namei_last_nofollow(path, ddp, &dp);
+    if (error != 0)
+        goto out;
+
+    vp = dp->d_vnode;
+    vn_lock(vp);
+    if (vp->v_type == VDIR) {
+        // Posix specifies that we should return EPERM here, but Linux
+        // actually returns EISDIR.
+        error = EISDIR;
+    } else if (vp->v_flags & VROOT) {
+        error = EBUSY;
+    }
+    if (error != 0)
+        goto out;
+
+    vn_lock(ddp->d_vnode);
+    error = vn_access(ddp->d_vnode, VWRITE);
+    if (error != 0) {
+        vn_unlock(ddp->d_vnode);
+        goto out;
+    }
+    error = VOP_REMOVE(ddp->d_vnode, vp, name);
+    vn_unlock(ddp->d_vnode);
+
+    vn_unlock(vp);
+    dentry_remove(dp);
+    drele(ddp);
+    drele(dp);
+    return error;
+
+out:
+    if (vp != nullptr)
+        vn_unlock(vp);
+    if (dp != nullptr)
+        drele(dp);
+    if (ddp != nullptr)
+        drele(ddp);
+    return error;
+}
+
+/*
+ * Check access to path. R_OK, W_OK and X_OK in mode are translated to
+ * VREAD, VWRITE and VEXEC and passed to vn_access().
+ * Returns the namei() error when path does not resolve.
+ */
+int
+sys_access(char *path, int mode)
+{
+    struct dentry *dp;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_access: path=%s mode=%x\n", path, mode));
+
+    /* If F_OK is set, we return here if file is not found. */
+    error = namei(path, &dp);
+    if (error)
+        return error;
+
+    int flags = 0;
+    flags |= (mode & R_OK) ? VREAD : 0;
+    flags |= (mode & W_OK) ? VWRITE : 0;
+    flags |= (mode & X_OK) ? VEXEC : 0;
+
+    error = vn_access(dp->d_vnode, flags);
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * Fill st for the file at path via vn_stat().
+ * Returns ENOENT when namei() yields an empty reference, or the code
+ * carried by an error exception.
+ */
+int
+sys_stat(char *path, struct stat *st)
+{
+    DPRINTF(VFSDB_SYSCALL, ("sys_stat: path=%s\n", path));
+
+    try {
+        dentry_ref dp = namei(path);
+        if (dp) {
+            return vn_stat(dp->d_vnode, st);
+        }
+        return ENOENT;
+    } catch (error e) {
+        return e.get();
+    }
+}
+
+/*
+ * Like sys_stat(), but the last path component is resolved with
+ * namei_last_nofollow() before vn_stat() is called on it.
+ */
+int sys_lstat(char *path, struct stat *st)
+{
+    struct dentry *parent;
+    struct dentry *target;
+    char *name;
+    int error;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_lstat: path=%s\n", path));
+
+    error = lookup(path, &parent, &name);
+    if (error)
+        return (error);
+
+    error = namei_last_nofollow(path, parent, &target);
+    if (error == 0) {
+        error = vn_stat(target->d_vnode, st);
+        drele(target);
+    }
+    drele(parent);
+    return error;
+}
+
+/*
+ * Zero buf, then fill it through VFS_STATFS for the mount holding path.
+ * Returns ENOENT when namei() yields an empty reference, or the code
+ * carried by an error exception.
+ */
+int
+sys_statfs(char *path, struct statfs *buf)
+{
+    memset(buf, 0, sizeof(*buf));
+
+    try {
+        dentry_ref dp = namei(path);
+        if (dp) {
+            return VFS_STATFS(dp->d_mount, buf);
+        }
+        return ENOENT;
+    } catch (error e) {
+        return e.get();
+    }
+}
+
+/*
+ * Zero buf, then fill it through VFS_STATFS for the mount of fp's vnode,
+ * with the vnode locked. Returns EBADF when fp has no dentry.
+ */
+int
+sys_fstatfs(struct file *fp, struct statfs *buf)
+{
+    if (!fp->f_dentry)
+        return EBADF;
+
+    struct vnode *vp = fp->f_dentry->d_vnode;
+
+    memset(buf, 0, sizeof(*buf));
+
+    vn_lock(vp);
+    int error = VFS_STATFS(vp->v_mount, buf);
+    vn_unlock(vp);
+
+    return error;
+}
+
+/*
+ * Truncate the file named by path to length bytes.
+ *
+ * The vnode is locked for the duration of VOP_TRUNCATE() and the dentry
+ * obtained from namei() is released afterwards.
+ */
+int
+sys_truncate(char *path, off_t length)
+{
+    struct dentry *entry;
+    struct vnode *node;
+    int rc;
+
+    rc = namei(path, &entry);
+    if (rc != 0)
+        return rc;
+
+    node = entry->d_vnode;
+    vn_lock(node);
+    rc = VOP_TRUNCATE(node, length);
+    vn_unlock(node);
+
+    drele(entry);
+    return rc;
+}
+
+/*
+ * Truncate an open file to length bytes.
+ *
+ * Returns EBADF when the file has no dentry; otherwise the result of
+ * VOP_TRUNCATE() executed under the vnode lock.
+ */
+int
+sys_ftruncate(struct file *fp, off_t length)
+{
+    struct vnode *node;
+    int rc;
+
+    if (!fp->f_dentry)
+        return EBADF;
+
+    node = fp->f_dentry->d_vnode;
+
+    vn_lock(node);
+    rc = VOP_TRUNCATE(node, length);
+    vn_unlock(node);
+
+    return rc;
+}
+
+/*
+ * Copy the path of the directory referenced by fp into cwd.
+ *
+ * cwd receives at most PATH_MAX bytes. Returns EBADF when fp has no
+ * dentry or when its vnode is not a directory, 0 otherwise.
+ */
+int
+sys_fchdir(struct file *fp, char *cwd)
+{
+    struct vnode *dir;
+    int rc;
+
+    if (!fp->f_dentry)
+        return EBADF;
+
+    dir = fp->f_dentry->d_vnode;
+
+    vn_lock(dir);
+    if (dir->v_type == VDIR) {
+        strlcpy(cwd, fp->f_dentry->d_path, PATH_MAX);
+        rc = 0;
+    } else {
+        rc = EBADF;
+    }
+    vn_unlock(dir);
+
+    return rc;
+}
+
+/*
+ * Read the contents of the symbolic link named by path into buf.
+ *
+ * @path:    path whose last component is resolved without following it
+ * @buf:     destination buffer
+ * @bufsize: capacity of buf in bytes
+ * @size:    set to the number of bytes stored (0 on any failure)
+ *
+ * Returns EINVAL when the target is not a symbolic link, otherwise the
+ * error from lookup(), namei_last_nofollow() or VOP_READLINK().
+ */
+int
+sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size)
+{
+    struct dentry *parent;
+    struct dentry *link;
+    struct vnode *node;
+    struct iovec iov;
+    struct uio io;
+    char *leaf;
+    int rc;
+
+    *size = 0;
+
+    rc = lookup(path, &parent, &leaf);
+    if (rc) {
+        return (rc);
+    }
+
+    rc = namei_last_nofollow(path, parent, &link);
+    if (rc) {
+        drele(parent);
+        return (rc);
+    }
+
+    node = link->d_vnode;
+    if (node->v_type == VLNK) {
+        /* One read-direction iovec describing the whole caller buffer. */
+        iov.iov_base = buf;
+        iov.iov_len = bufsize;
+
+        io.uio_iov = &iov;
+        io.uio_iovcnt = 1;
+        io.uio_offset = 0;
+        io.uio_resid = bufsize;
+        io.uio_rw = UIO_READ;
+
+        vn_lock(node);
+        rc = VOP_READLINK(node, &io);
+        vn_unlock(node);
+    } else {
+        rc = EINVAL;
+    }
+
+    drele(link);
+    drele(parent);
+
+    if (rc) {
+        return (rc);
+    }
+
+    /* Bytes stored = capacity minus what the read left unused. */
+    *size = bufsize - io.uio_resid;
+    return (0);
+}
+
+/*
+ * Tell whether a struct timeval holds sane values: a non-negative
+ * number of seconds and a microsecond part within [0, 1000000).
+ */
+static bool is_timeval_valid(const struct timeval *time)
+{
+    if (time->tv_sec < 0) {
+        return false;
+    }
+    return time->tv_usec >= 0 && time->tv_usec < 1000000;
+}
+
+/*
+ * Convert a timeval struct to a timespec one.
+ *
+ * When from is null, to is filled with the current CLOCK_REALTIME time
+ * instead.
+ */
+static void convert_timeval(struct timespec &to, const struct timeval *from)
+{
+    if (from) {
+        to.tv_sec = from->tv_sec;
+        /* Convert microseconds to nanoseconds. */
+        to.tv_nsec = from->tv_usec * 1000;
+    } else {
+        clock_gettime(CLOCK_REALTIME, &to);
+    }
+}
+
+/*
+ * Set the access and modification times of the file named by path.
+ *
+ * @path:  file to update
+ * @times: times[0] is the access time, times[1] the modification time;
+ *         a null pointer means "use the current time" for both
+ * @flags: when AT_SYMLINK_NOFOLLOW is set the last path component is
+ *         resolved with namei_last_nofollow() instead of namei()
+ *
+ * Returns EINVAL for out-of-range timeval members, EROFS when the mount
+ * is read-only, otherwise the lookup error or the vn_settimes() result.
+ */
+int
+sys_utimes(char *path, const struct timeval times[2], int flags)
+{
+    int error;
+    struct dentry *dp;
+    struct timespec timespec_times[2];
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_utimes: path=%s\n", path));
+
+    if (times && (!is_timeval_valid(&times[0]) ||
+                  !is_timeval_valid(&times[1])))
+        return EINVAL;
+
+    // Convert each element of timeval array to the timespec type
+    convert_timeval(timespec_times[0], times ? times + 0 : nullptr);
+    convert_timeval(timespec_times[1], times ? times + 1 : nullptr);
+
+    if (flags & AT_SYMLINK_NOFOLLOW) {
+        struct dentry *ddp;
+        error = lookup(path, &ddp, nullptr);
+        if (error) {
+            return error;
+        }
+
+        error = namei_last_nofollow(path, ddp, &dp);
+        /* The parent is only needed for the final-component lookup. */
+        if (ddp != nullptr) {
+            drele(ddp);
+        }
+        if (error) {
+            return error;
+        }
+    } else {
+        error = namei(path, &dp);
+        if (error)
+            return error;
+    }
+
+    if (dp->d_mount->m_flags & MNT_RDONLY) {
+        error = EROFS;
+    } else {
+        error = vn_settimes(dp->d_vnode, timespec_times);
+    }
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * Tell whether a struct timespec is acceptable for utimensat(): seconds
+ * must be non-negative and the nanosecond part must either lie within
+ * [0, 999999999] or be one of the special values UTIME_NOW / UTIME_OMIT.
+ */
+static bool is_timespec_valid(const struct timespec &time)
+{
+    if (time.tv_sec < 0) {
+        return false;
+    }
+    if (time.tv_nsec >= 0 && time.tv_nsec <= 999999999) {
+        return true;
+    }
+    return time.tv_nsec == UTIME_NOW || time.tv_nsec == UTIME_OMIT;
+}
+
+/*
+ * Initialise _times from an optional caller-supplied timespec.
+ *
+ * A null pointer or a tv_nsec of UTIME_NOW selects the current
+ * CLOCK_REALTIME time; anything else is copied member by member.
+ */
+void init_timespec(struct timespec &_times, const struct timespec *times)
+{
+    const bool use_now = (times == nullptr) || (times->tv_nsec == UTIME_NOW);
+
+    if (use_now) {
+        clock_gettime(CLOCK_REALTIME, &_times);
+        return;
+    }
+
+    _times.tv_sec = times->tv_sec;
+    _times.tv_nsec = times->tv_nsec;
+}
+
+/*
+ * Set file timestamps with nanosecond precision.
+ *
+ * @dirfd:    directory the relative pathname is resolved against, or
+ *            AT_FDCWD for the cwd of main_task
+ * @pathname: target path; may be null only when dirfd is a real fd
+ * @times:    times[0] = access time, times[1] = modification time;
+ *            null means "now" for both
+ * @flags:    0 or AT_SYMLINK_NOFOLLOW (accepted but not implemented)
+ *
+ * Returns ENOENT for an empty pathname, EINVAL for unknown flags or
+ * invalid timespecs, EFAULT/EBADF/ENOTDIR for bad dirfd/pathname
+ * combinations, EROFS on a read-only mount, EACCES/EPERM when the
+ * permission checks fail, otherwise the namei()/vn_settimes() result.
+ */
+int
+sys_utimensat(int dirfd, const char *pathname, const struct timespec times[2],
+              int flags)
+{
+    int error;
+    std::string ap;
+    struct timespec timespec_times[2];
+    extern struct task *main_task;
+    struct dentry *dp;
+
+    /* utimensat should return ENOENT when pathname is empty */
+    if (pathname && pathname[0] == 0)
+        return ENOENT;
+
+    if (flags && !(flags & AT_SYMLINK_NOFOLLOW))
+        return EINVAL;
+
+    if (times && (!is_timespec_valid(times[0]) ||
+                  !is_timespec_valid(times[1])))
+        return EINVAL;
+
+    init_timespec(timespec_times[0], times ? times + 0 : nullptr);
+    init_timespec(timespec_times[1], times ? times + 1 : nullptr);
+
+    if (pathname && pathname[0] == '/') {
+        ap = pathname;
+    } else if (dirfd == AT_FDCWD) {
+        if (!pathname)
+            return EFAULT;
+        ap = std::string(main_task->t_cwd) + "/" + pathname;
+    } else {
+        struct file *fp;
+        fileref f(fileref_from_fd(dirfd));
+
+        if (!f)
+            return EBADF;
+
+        fp = f.get();
+
+        if (!fp->f_dentry)
+            return EBADF;
+
+        /*
+         * v_type is an enumeration, not a bit mask: compare for equality
+         * (as sys_fchdir() does) so that other types sharing a bit with
+         * VDIR are not mistaken for directories.
+         */
+        if (fp->f_dentry->d_vnode->v_type != VDIR)
+            return ENOTDIR;
+
+        if (pathname)
+            ap = std::string(fp->f_dentry->d_path) + "/" + pathname;
+        else
+            ap = fp->f_dentry->d_path;
+
+        ap = std::string(fp->f_dentry->d_mount->m_path) + "/" + ap;
+    }
+
+    /* FIXME: Add support for AT_SYMLINK_NOFOLLOW */
+
+    error = namei(ap.c_str(), &dp);
+
+    if (error)
+        return error;
+
+    /*
+     * From here on dp holds a reference: every outcome must fall through
+     * to the drele() below instead of returning directly.
+     */
+    if (dp->d_mount->m_flags & MNT_RDONLY) {
+        error = EROFS;
+    } else if (vn_access(dp->d_vnode, VWRITE)) {
+        error = EACCES;
+    } else if (times &&
+               (times[0].tv_nsec != UTIME_NOW || times[1].tv_nsec != UTIME_NOW) &&
+               (times[0].tv_nsec != UTIME_OMIT || times[1].tv_nsec != UTIME_OMIT) &&
+               (!(dp->d_vnode->v_mode & ~VAPPEND))) {
+        error = EPERM;
+    } else {
+        error = vn_settimes(dp->d_vnode, timespec_times);
+    }
+
+    drele(dp);
+    return error;
+}
+
+/*
+ * Set the timestamps of an open file descriptor.
+ *
+ * Implemented by taking the dentry path of the descriptor and handing
+ * it to sys_utimensat() with AT_FDCWD and no flags. Returns EBADF when
+ * fd does not name an open file or the file has no dentry.
+ */
+int
+sys_futimens(int fd, const struct timespec times[2])
+{
+    fileref f(fileref_from_fd(fd));
+    if (!f)
+        return EBADF;
+
+    struct file *fp = f.get();
+    if (!fp->f_dentry)
+        return EBADF;
+
+    std::string pathname = fp->f_dentry->d_path;
+    return sys_utimensat(AT_FDCWD, pathname.c_str(), times, 0);
+}
+
+/*
+ * Manipulate the allocated space of an open file.
+ *
+ * Returns EBADF when fp has no dentry or was not opened for writing,
+ * EINVAL for a negative offset or non-positive len, ENOTSUP for
+ * FALLOC_FL_PUNCH_HOLE without FALLOC_FL_KEEP_SIZE, ENODEV when the
+ * vnode is neither a regular file nor a directory, EOPNOTSUPP when the
+ * file system provides no vop_fallocate, otherwise the result of
+ * VOP_FALLOCATE() executed under the vnode lock.
+ */
+int
+sys_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
+{
+    struct vnode *node;
+    int rc;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_fallocate: fp=%x", fp));
+
+    if (!fp->f_dentry || !(fp->f_flags & FWRITE)) {
+        return EBADF;
+    }
+
+    if (offset < 0 || len <= 0) {
+        return EINVAL;
+    }
+
+    // Strange, but that's what Linux returns.
+    if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE)) {
+        return ENOTSUP;
+    }
+
+    node = fp->f_dentry->d_vnode;
+    vn_lock(node);
+
+    if (node->v_type != VREG && node->v_type != VDIR) {
+        // Whether the backing device is a block device is not detected
+        // here; the fs itself tells us whether fallocate is supported.
+        rc = ENODEV;
+    } else if (!node->v_op->vop_fallocate) {
+        // The file system behind this vnode has no fallocate support.
+        rc = EOPNOTSUPP;
+    } else {
+        rc = VOP_FALLOCATE(node, mode, offset, len);
+    }
+
+    vn_unlock(node);
+    return rc;
+}
+
+/*
+ * Change the permission bits of the file named by path.
+ *
+ * Returns the namei() error, EROFS when the mount is read-only, or the
+ * result of vn_setmode().
+ */
+int
+sys_chmod(const char *path, mode_t mode)
+{
+    struct dentry *entry;
+    int rc;
+
+    DPRINTF(VFSDB_SYSCALL, ("sys_chmod: path=%s\n", path));
+
+    rc = namei(path, &entry);
+    if (rc != 0)
+        return rc;
+
+    rc = (entry->d_mount->m_flags & MNT_RDONLY)
+        ? EROFS
+        : vn_setmode(entry->d_vnode, mode);
+
+    drele(entry);
+    return rc;
+}
+
+/*
+ * Change the permission bits of an open file descriptor.
+ *
+ * Returns EBADF for an unknown fd and EROFS on a read-only mount. A
+ * descriptor without a dentry is silently accepted (returns 0).
+ */
+int
+sys_fchmod(int fd, mode_t mode)
+{
+    fileref f(fileref_from_fd(fd));
+    if (!f)
+        return EBADF;
+
+    // POSIX leaves fchmod() on something that is not a real file open:
+    // it may fail with EINVAL on a pipe, is "unspecified" on a socket and
+    // must succeed as a no-op on a STREAM. Linux appears to always take
+    // the last option, so do the same: succeed without doing anything.
+    if (!f->f_dentry) {
+        return 0;
+    }
+
+    if (f->f_dentry->d_mount->m_flags & MNT_RDONLY) {
+        return EROFS;
+    }
+
+    return vn_setmode(f->f_dentry->d_vnode, mode);
+}
diff --git a/lib/vfscore/task.c b/lib/vfscore/task.c
new file mode 100644
index 00000000..7a355034
--- /dev/null
+++ b/lib/vfscore/task.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 2007, Kohsuke Ohtani All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_task.c - Routines to manage the per task data.
+ */
+
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#include <osv/prex.h>
+#include "vfs.h"
+
+/*
+ * Allocate a new task structure.
+ *
+ * The structure is zero-filled and its working directory is set to "/".
+ * On success the new task is stored in *pt and 0 is returned; ENOMEM is
+ * returned when the allocation yields a null pointer.
+ */
+int
+task_alloc(struct task **pt)
+{
+    // FIXME: where do we free task ?
+    struct task *fresh = new task;
+
+    if (!fresh)
+        return ENOMEM;
+
+    memset(fresh, 0, sizeof(*fresh));
+    strlcpy(fresh->t_cwd, "/", sizeof(fresh->t_cwd));
+
+    *pt = fresh;
+    return 0;
+}
+
+/*
+ * Convert to full path from the cwd of task and path.
+ * @wd:    working directory
+ * @cpath: target path (absolute, or relative to wd)
+ * @full:  full path to be returned; NOTE(review): assumed to be a buffer
+ *         of at least PATH_MAX bytes, as the strlcpy() below implies
+ *
+ * "." components are dropped and ".." components remove the previous
+ * component. Returns 0 on success or ENAMETOOLONG when the input or the
+ * combined result would not fit into PATH_MAX bytes.
+ */
+int
+path_conv(char *wd, const char *cpath, char *full)
+{
+    char path[PATH_MAX];
+    char *src, *tgt, *p, *end;
+    size_t len = 0;
+    size_t need;
+    int dot_first;
+
+    /*
+     * strlcpy() returns strlen(cpath); a value >= PATH_MAX means the copy
+     * was truncated. Checking the length of the (already truncated) copy
+     * afterwards could never detect that.
+     */
+    if (strlcpy(path, cpath, PATH_MAX) >= PATH_MAX)
+        return ENAMETOOLONG;
+
+    len = strlen(path);
+
+    /* A relative path may need one extra '/' between wd and path. */
+    need = strlen(wd) + len;
+    if (path[0] != '/')
+        need++;
+    if (need >= PATH_MAX)
+        return ENAMETOOLONG;
+
+    /*
+     * True only when the first component is exactly "." or "..". Those
+     * components manage the separator themselves in the loop below; any
+     * other name, including ones that merely start with a dot such as
+     * ".hidden", needs a '/' after the working directory.
+     */
+    dot_first = path[0] == '.' &&
+        (path[1] == '\0' || path[1] == '/' ||
+         (path[1] == '.' && (path[2] == '\0' || path[2] == '/')));
+
+    src = path;
+    tgt = full;
+    end = src + len;
+    if (path[0] == '/') {
+        *tgt++ = *src++;
+        len = 1;
+    } else {
+        strlcpy(full, wd, PATH_MAX);
+        len = strlen(wd);
+        tgt += len;
+        if (len > 1 && !dot_first) {
+            *tgt = '/';
+            tgt++;
+            len++;
+        }
+    }
+    /* From here on len tracks the number of bytes written to full. */
+    while (*src) {
+        /* Isolate the next component by terminating it in place. */
+        p = src;
+        while (*p != '/' && *p != '\0')
+            p++;
+        *p = '\0';
+        if (!strcmp(src, "..")) {
+            if (len >= 2) {
+                len -= 2;
+                tgt -= 2; /* skip previous '/' */
+                while (*tgt != '/') {
+                    tgt--;
+                    len--;
+                }
+                /* Never back up past the root '/'. */
+                if (len == 0) {
+                    tgt++;
+                    len++;
+                }
+            }
+        } else if (!strcmp(src, ".")) {
+            /* Ignore "." */
+        } else {
+            while (*src != '\0') {
+                *tgt++ = *src++;
+                len++;
+            }
+        }
+        if (p == end)
+            break;
+        if (len > 0 && *(tgt - 1) != '/') {
+            *tgt++ = '/';
+            len++;
+        }
+        src = p + 1;
+    }
+    *tgt = '\0';
+
+    return (0);
+}
+
+/*
+ * Build the full path for cpath relative to the cwd of a task.
+ * @t:     task structure whose t_cwd is used as the base
+ * @cpath: target path
+ * @acc:   access mode (currently unused, the permission check is
+ *         commented out)
+ * @full:  full path to be returned
+ *
+ * Returns the result of path_conv().
+ */
+int
+task_conv(struct task *t, const char *cpath, int acc, char *full)
+{
+    const int rc = path_conv(t->t_cwd, cpath, full);
+
+    if (rc != 0) {
+        return (rc);
+    }
+
+    /* Check if the client task has required permission */
+    return (0); //sec_file_permission(t->t_taskid, full, acc);
+}
+
+/*
+ * Bounded string copy that reports truncation.
+ *
+ * Copies src into dest (capacity size) with strlcpy(). Returns 0 when
+ * the whole string fit and -1 when it had to be truncated.
+ */
+int vfs_dname_copy(char *dest, const char *src, size_t size)
+{
+    const size_t wanted = strlcpy(dest, src, size);
+
+    return (wanted >= size) ? -1 : 0;
+}
diff --git a/lib/vfscore/vfs.h b/lib/vfscore/vfs.h
new file mode 100644
index 00000000..d86ef957
--- /dev/null
+++ b/lib/vfscore/vfs.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2005-2007, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VFS_H
+#define _VFS_H
+
+#include <sys/cdefs.h>
+#include <assert.h>
+#include <dirent.h>
+#include <limits.h>
+
+#include <osv/prex.h>
+#include <osv/file.h>
+#include <osv/mount.h>
+#include <osv/vnode.h>
+#include <osv/dentry.h>
+#include <osv/error.h>
+
+/*
+ * Import vnode attributes flags
+ */
+#include <osv/vnode_attr.h>
+
+/* #define DEBUG_VFS 1 */
+
+/*
+ * Tunable parameters
+ */
+#define FSMAXNAMES 16 /* max length of 'file system' name */
+
+#ifdef DEBUG_VFS
+#include <osv/debug.h>
+
+extern int vfs_debug;
+
+#define VFSDB_CORE 0x00000001
+#define VFSDB_SYSCALL 0x00000002
+#define VFSDB_VNODE 0x00000004
+#define VFSDB_BIO 0x00000008
+#define VFSDB_CAP 0x00000010
+
+#define VFSDB_FLAGS 0x00000013
+
+#define DPRINTF(_m,X) if (vfs_debug & (_m)) kprintf X
+#else
+#define DPRINTF(_m, X)
+#endif
+
+#define ASSERT(e) assert(e)
+
+#define OPEN_MAX 256
+
+/*
+ * per task data
+ *
+ * t_cwd is initialised to "/" by task_alloc() and used by task_conv()
+ * as the base for relative paths.
+ */
+struct task {
+ char t_cwd[PATH_MAX]; /* current working directory */
+ struct file *t_cwdfp; /* directory for cwd */
+};
+
+extern const struct vfssw vfssw[];
+
+__BEGIN_DECLS
+int sys_open(char *path, int flags, mode_t mode, struct file **fp);
+int sys_read(struct file *fp, const struct iovec *iov, size_t niov,
+ off_t offset, size_t *count);
+int sys_write(struct file *fp, const struct iovec *iov, size_t niov,
+ off_t offset, size_t *count);
+int sys_lseek(struct file *fp, off_t off, int type, off_t * cur_off);
+int sys_ioctl(struct file *fp, u_long request, void *buf);
+int sys_fstat(struct file *fp, struct stat *st);
+int sys_fstatfs(struct file *fp, struct statfs *buf);
+int sys_fsync(struct file *fp);
+int sys_ftruncate(struct file *fp, off_t length);
+
+int sys_readdir(struct file *fp, struct dirent *dirent);
+int sys_rewinddir(struct file *fp);
+int sys_seekdir(struct file *fp, long loc);
+int sys_telldir(struct file *fp, long *loc);
+int sys_fchdir(struct file *fp, char *path);
+
+int sys_mkdir(char *path, mode_t mode);
+int sys_rmdir(char *path);
+int sys_mknod(char *path, mode_t mode);
+int sys_rename(char *src, char *dest);
+int sys_link(char *oldpath, char *newpath);
+int sys_unlink(char *path);
+int sys_symlink(const char *oldpath, const char *newpath);
+int sys_access(char *path, int mode);
+int sys_stat(char *path, struct stat *st);
+int sys_lstat(char *path, struct stat *st);
+int sys_statfs(char *path, struct statfs *buf);
+int sys_truncate(char *path, off_t length);
+int sys_readlink(char *path, char *buf, size_t bufsize, ssize_t *size);
+int sys_utimes(char *path, const struct timeval times[2], int flags);
+int sys_utimensat(int dirfd, const char *pathname,
+ const struct timespec times[2], int flags);
+int sys_futimens(int fd, const struct timespec times[2]);
+int sys_fallocate(struct file *fp, int mode, loff_t offset, loff_t len);
+
+int sys_mount(const char *dev, const char *dir, const char *fsname,
+              int flags, const void *data);
+int sys_umount2(const char *path, int flags);
+int sys_umount(const char *path);
+int sys_pivot_root(const char *new_root, const char *old_put);
+int sys_sync(void);
+int sys_chmod(const char *path, mode_t mode);
+int sys_fchmod(int fd, mode_t mode);
+
+
+int task_alloc(struct task **pt);
+int task_conv(struct task *t, const char *path, int mode, char *full);
+int path_conv(char *wd, const char *cpath, char *full);
+
+//int sec_file_permission(task_t task, char *path, int mode);
+int sec_vnode_permission(char *path);
+
+int namei(const char *path, struct dentry **dpp);
+int namei_last_nofollow(char *path, struct dentry *ddp,
+                        struct dentry **dp);
+int lookup(char *path, struct dentry **dpp, char **name);
+void vnode_init(void);
+void lookup_init(void);
+
+int vfs_findroot(const char *path, struct mount **mp, char **root);
+int vfs_dname_copy(char *dest, const char *src, size_t size);
+
+int fs_noop(void);
+
+struct dentry *dentry_alloc(struct dentry *parent_dp, struct vnode *vp,
+                            const char *path);
+struct dentry *dentry_lookup(struct mount *mp, char *path);
+void dentry_move(struct dentry *dp, struct dentry *parent_dp, char *path);
+void dentry_remove(struct dentry *dp);
+void dref(struct dentry *dp);
+void drele(struct dentry *dp);
+void dentry_init(void);
+
+#ifdef DEBUG_VFS
+void vnode_dump(void);
+void mount_dump(void);
+#endif
+
+__END_DECLS
+
+#ifdef __cplusplus
+
+// C++ convenience wrapper around namei(path, &dp).
+//
+// Returns a dentry_ref for the path. "Not found" (ENOENT) is reported
+// as an empty reference rather than an exception, for efficiency; every
+// other error is thrown via make_error().
+inline dentry_ref namei(char* path)
+{
+    dentry* found;
+    auto rc = namei(path, &found);
+
+    if (rc == ENOENT) {
+        return dentry_ref();
+    }
+    if (rc) {
+        throw make_error(rc);
+    }
+    return dentry_ref(found, false);
+}
+
+#endif
+
+#endif /* !_VFS_H */
diff --git a/lib/vfscore/vnode.c b/lib/vfscore/vnode.c
new file mode 100644
index 00000000..a292344f
--- /dev/null
+++ b/lib/vfscore/vnode.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (c) 2005-2008, Kohsuke Ohtani
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * vfs_vnode.c - vnode service
+ */
+
+#include <limits.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include <osv/prex.h>
+#include <osv/vnode.h>
+#include "vfs.h"
+
+/*
+ * 16-entry table of vnode types.
+ * NOTE(review): presumably indexed by the file-type nibble of a mode
+ * value (S_IFMT bits shifted down) -- confirm against its users.
+ */
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+/*
+ * 10-entry table of S_IF* file-type bits.
+ * NOTE(review): presumably indexed by enum vtype -- confirm that the
+ * enumeration order matches these entries.
+ */
+int vttoif_tab[10] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
+};
+
+/*
+ * Memo:
+ *
+ * Function Ref count Lock
+ * ---------- --------- ----------
+ * vn_lock * Lock
+ * vn_unlock * Unlock
+ * vget 1 Lock
+ * vput -1 Unlock
+ * vref +1 *
+ * vrele -1 *
+ */
+
+#define VNODE_BUCKETS 32 /* size of vnode hash table */
+
+/*
+ * vnode table.
+ * All active (opened) vnodes are stored on this hash table.
+ * They can be accessed by its path name.
+ */
+static LIST_HEAD(vnode_hash_head, vnode) vnode_table[VNODE_BUCKETS];
+
+/*
+ * Global lock to access all vnodes and vnode table.
+ * If a vnode is already locked, there is no need to
+ * lock this global lock to access internal data.
+ */
+static mutex_t vnode_lock = MUTEX_INITIALIZER;
+#define VNODE_LOCK() mutex_lock(&vnode_lock)
+#define VNODE_UNLOCK() mutex_unlock(&vnode_lock)
+#define VNODE_OWNED() mutex_owned(&vnode_lock)
+
+/*
+ * Compute the vnode_table bucket for a (mount point, inode number)
+ * pair by XOR-ing the two and masking to the table size.
+ * XXX(hch): replace with a better hash for 64-bit pointers.
+ */
+static u_int
+vn_hash(struct mount *mp, uint64_t ino)
+{
+    const unsigned long mount_bits = (unsigned long)mp;
+
+    return (ino ^ mount_bits) & (VNODE_BUCKETS - 1);
+}
+
+/*
+ * Find the vnode for the given mount point and inode number.
+ *
+ * On a hit the vnode's reference count is incremented, its lock is
+ * taken (v_nrlocks is bumped accordingly) and the vnode is returned.
+ * Returns nullptr when no matching vnode is in the table.
+ *
+ * Locking: VNODE_LOCK must be held.
+ */
+struct vnode *
+vn_lookup(struct mount *mp, uint64_t ino)
+{
+    struct vnode *cur;
+
+    assert(VNODE_OWNED());
+
+    LIST_FOREACH(cur, &vnode_table[vn_hash(mp, ino)], v_link) {
+        if (cur->v_mount != mp || cur->v_ino != ino)
+            continue;
+
+        cur->v_refcnt++;
+        mutex_lock(&cur->v_lock);
+        cur->v_nrlocks++;
+        return cur;
+    }
+
+    return nullptr; /* not found */
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Debug helper: path of the first dentry on the vnode's name list, or
+ * " " when the vnode has no names.
+ */
+static const char *
+vn_path(struct vnode *vp)
+{
+    if (LIST_EMPTY(&vp->v_names) == 1) {
+        return (" ");
+    }
+
+    struct dentry *first = LIST_FIRST(&vp->v_names);
+
+    return (first->d_path);
+}
+#endif
+
+/*
+ * Lock a vnode.
+ *
+ * The vnode must be referenced (v_refcnt > 0). Takes v_lock and counts
+ * the acquisition in v_nrlocks.
+ */
+void
+vn_lock(struct vnode *vp)
+{
+    ASSERT(vp);
+    ASSERT(vp->v_refcnt > 0);
+
+    mutex_lock(&vp->v_lock);
+    vp->v_nrlocks += 1;
+
+    DPRINTF(VFSDB_VNODE, ("vn_lock: %s\n", vn_path(vp)));
+}
+
+/*
+ * Unlock vnode
+ *
+ * The vnode must be referenced and currently locked. Decrements
+ * v_nrlocks and releases v_lock.
+ */
+void
+vn_unlock(struct vnode *vp)
+{
+    ASSERT(vp);
+    ASSERT(vp->v_refcnt > 0);
+    ASSERT(vp->v_nrlocks > 0);
+
+    vp->v_nrlocks--;
+    mutex_unlock(&vp->v_lock);
+    /* Tag the trace with this function's name, not vn_lock's. */
+    DPRINTF(VFSDB_VNODE, ("vn_unlock: %s\n", vn_path(vp)));
+}
+
+/*
+ * Get the vnode for (mp, ino), allocating a new one when it is not in
+ * the vnode table yet. The returned vnode is referenced and locked.
+ *
+ * Return value:
+ *   1      vnode was found in the table; *vpp is set
+ *   0      a new vnode was allocated and inserted; *vpp is set
+ *          (0 is also returned, with *vpp left null, if the allocation
+ *          itself yields a null pointer)
+ *   other  error code from VFS_VGET(); *vpp is left null
+ */
+int
+vget(struct mount *mp, uint64_t ino, struct vnode **vpp)
+{
+    struct vnode *vp;
+    int error;
+
+    *vpp = nullptr;
+
+    /* "%LLu" is not a valid conversion; print the inode as %llu. */
+    DPRINTF(VFSDB_VNODE, ("vget %llu\n", (unsigned long long)ino));
+
+    VNODE_LOCK();
+
+    vp = vn_lookup(mp, ino);
+    if (vp) {
+        /* vn_lookup() already took the reference and the vnode lock. */
+        VNODE_UNLOCK();
+        *vpp = vp;
+        return 1;
+    }
+
+    if (!(vp = new vnode())) {
+        VNODE_UNLOCK();
+        return 0;
+    }
+
+    LIST_INIT(&vp->v_names);
+    vp->v_ino = ino;
+    vp->v_mount = mp;
+    vp->v_refcnt = 1;
+    vp->v_op = mp->m_op->vfs_vnops;
+    vp->v_nrlocks = 0;
+
+    /*
+     * Request to allocate fs specific data for vnode.
+     */
+    if ((error = VFS_VGET(mp, vp)) != 0) {
+        VNODE_UNLOCK();
+        delete vp;
+        return error;
+    }
+    vfs_busy(vp->v_mount);
+    mutex_lock(&vp->v_lock);
+    vp->v_nrlocks++;
+
+    LIST_INSERT_HEAD(&vnode_table[vn_hash(mp, ino)], vp, v_link);
+    VNODE_UNLOCK();
+
+    *vpp = vp;
+
+    return 0;
+}
+
+/*
+ * Unlock vnode and decrement its reference count.
+ *
+ * The caller must hold the vnode lock (v_nrlocks > 0). When the last
+ * reference is dropped the vnode is removed from the vnode table, the
+ * file system's inactive hook (if any) is called, the mount is
+ * un-busied and the vnode is deleted.
+ */
+void
+vput(struct vnode *vp)
+{
+ ASSERT(vp);
+ ASSERT(vp->v_nrlocks > 0);
+ ASSERT(vp->v_refcnt > 0);
+ DPRINTF(VFSDB_VNODE, ("vput: ref=%d %s\n", vp->v_refcnt, vn_path(vp)));
+
+ VNODE_LOCK();
+ vp->v_refcnt--;
+ if (vp->v_refcnt > 0) {
+ /* Still referenced elsewhere: only release the lock. */
+ VNODE_UNLOCK();
+ vn_unlock(vp);
+ return;
+ }
+ /* Last reference: unhash while the table lock is still held. */
+ LIST_REMOVE(vp, v_link);
+ VNODE_UNLOCK();
+
+ /*
+ * Deallocate fs specific vnode data
+ */
+ if (vp->v_op->vop_inactive)
+ VOP_INACTIVE(vp);
+ vfs_unbusy(vp->v_mount);
+ /* Undo the caller's lock by hand; vn_unlock() asserts v_refcnt > 0. */
+ vp->v_nrlocks--;
+ ASSERT(vp->v_nrlocks == 0);
+ mutex_unlock(&vp->v_lock);
+ delete vp;
+}
+
+/*
+ * Take an additional reference on a vnode that is already active
+ * (v_refcnt > 0, i.e. it was obtained through vget()).
+ */
+void
+vref(struct vnode *vp)
+{
+    ASSERT(vp);
+    ASSERT(vp->v_refcnt > 0); /* Need vget */
+
+    VNODE_LOCK();
+    DPRINTF(VFSDB_VNODE, ("vref: ref=%d\n", vp->v_refcnt));
+    vp->v_refcnt += 1;
+    VNODE_UNLOCK();
+}
+
+/*
+ * Decrement the reference count of the vnode.
+ * Any code in the system which is using vnode should call vrele()
+ * when it is finished with the vnode.
+ * If the count drops to zero the vnode is removed from the vnode table,
+ * the file system's inactive routine (if any) is called, the mount is
+ * un-busied and the vnode is deleted.
+ */
+void
+vrele(struct vnode *vp)
+{
+    ASSERT(vp);
+    ASSERT(vp->v_refcnt > 0);
+
+    VNODE_LOCK();
+    DPRINTF(VFSDB_VNODE, ("vrele: ref=%d\n", vp->v_refcnt));
+    vp->v_refcnt--;
+    if (vp->v_refcnt > 0) {
+        VNODE_UNLOCK();
+        return;
+    }
+    LIST_REMOVE(vp, v_link);
+    VNODE_UNLOCK();
+
+    /*
+     * Deallocate fs specific vnode data. The hook is optional, so guard
+     * the call the same way vput() does.
+     */
+    if (vp->v_op->vop_inactive)
+        VOP_INACTIVE(vp);
+    vfs_unbusy(vp->v_mount);
+    delete vp;
+}
+
+/*
+ * Remove all vnode in the vnode table for unmount.
+ *
+ * NOTE: the body is currently empty, so calling this has no effect.
+ */
+void
+vflush(struct mount *mp)
+{
+}
+
+/*
+ * Fill *st from the attributes of a vnode.
+ *
+ * *st is zeroed first. The attributes come from VOP_GETATTR(); the
+ * file-type bits of st_mode are derived from vp->v_type. Returns the
+ * VOP_GETATTR() error, EBADF for a vnode type without a matching S_IF*
+ * value, or 0 on success.
+ */
+int
+vn_stat(struct vnode *vp, struct stat *st)
+{
+    struct vattr attrs;
+    mode_t mode;
+    int rc;
+
+    memset(st, 0, sizeof(struct stat));
+    memset(&attrs, 0, sizeof(struct vattr));
+
+    rc = VOP_GETATTR(vp, &attrs);
+    if (rc)
+        return rc;
+
+    st->st_ino = (ino_t)attrs.va_nodeid;
+    st->st_size = attrs.va_size;
+
+    /* Combine the permission bits with the type bits for this vnode. */
+    mode = attrs.va_mode;
+    switch (vp->v_type) {
+    case VREG:  mode |= S_IFREG;  break;
+    case VDIR:  mode |= S_IFDIR;  break;
+    case VBLK:  mode |= S_IFBLK;  break;
+    case VCHR:  mode |= S_IFCHR;  break;
+    case VLNK:  mode |= S_IFLNK;  break;
+    case VSOCK: mode |= S_IFSOCK; break;
+    case VFIFO: mode |= S_IFIFO;  break;
+    default:
+        return EBADF;
+    }
+
+    st->st_mode = mode;
+    st->st_nlink = attrs.va_nlink;
+    st->st_blksize = BSIZE;
+    st->st_blocks = attrs.va_size / S_BLKSIZE;
+    st->st_uid = attrs.va_uid;
+    st->st_gid = attrs.va_gid;
+    st->st_dev = attrs.va_fsid;
+
+    /* Only device nodes carry a device number of their own. */
+    if (vp->v_type == VCHR || vp->v_type == VBLK)
+        st->st_rdev = attrs.va_rdev;
+
+    st->st_atim = attrs.va_atime;
+    st->st_mtim = attrs.va_mtime;
+    st->st_ctim = attrs.va_ctime;
+
+    return 0;
+}
+
+/*
+ * Set the access (times[0]) and modification (times[1]) time of a
+ * vnode through VOP_SETATTR().
+ *
+ * A time whose tv_nsec is UTIME_OMIT is left out of the attribute mask.
+ * The vnode is locked around the VOP_SETATTR() call.
+ */
+int
+vn_settimes(struct vnode *vp, struct timespec times[2])
+{
+    struct vattr attrs;
+    int rc;
+
+    memset(&attrs, 0, sizeof(struct vattr));
+
+    attrs.va_atime = times[0];
+    attrs.va_mtime = times[1];
+    attrs.va_mask = ((times[0].tv_nsec == UTIME_OMIT) ? 0 : AT_ATIME)
+                  | ((times[1].tv_nsec == UTIME_OMIT) ? 0 : AT_MTIME);
+
+    vn_lock(vp);
+    rc = VOP_SETATTR(vp, &attrs);
+    vn_unlock(vp);
+
+    return rc;
+}
+
+/*
+ * Apply chmod-style permission bits to a vnode.
+ *
+ * Under the vnode lock, v_mode is updated and VOP_SETATTR() is called
+ * with only AT_MODE set in the attribute mask. Returns the
+ * VOP_SETATTR() result.
+ */
+int
+vn_setmode(struct vnode *vp, mode_t new_mode)
+{
+    struct vattr attrs;
+    int rc;
+
+    memset(&attrs, 0, sizeof(attrs));
+    attrs.va_mode = new_mode;
+    attrs.va_mask = AT_MODE;
+
+    vn_lock(vp);
+    vp->v_mode = new_mode;
+    rc = VOP_SETATTR(vp, &attrs);
+    vn_unlock(vp);
+
+    return rc;
+}
+
+/*
+ * Check the requested access (VEXEC/VREAD/VWRITE bits in flags)
+ * against the mode bits of a vnode.
+ *
+ * Returns EACCES when none of the corresponding permission bits are set
+ * in v_mode, EROFS for a write request on a read-only mount, and 0 when
+ * every requested access is allowed.
+ */
+int
+vn_access(struct vnode *vp, int flags)
+{
+    if ((flags & VEXEC) && (vp->v_mode & 0111) == 0)
+        return EACCES;
+
+    if ((flags & VREAD) && (vp->v_mode & 0444) == 0)
+        return EACCES;
+
+    if (flags & VWRITE) {
+        if (vp->v_mount->m_flags & MNT_RDONLY)
+            return EROFS;
+        if ((vp->v_mode & 0222) == 0)
+            return EACCES;
+    }
+
+    return 0;
+}
+
+#ifdef DEBUG_VFS
+/*
+ * Dump all vnodes in the vnode table (debug builds only).
+ *
+ * One line is printed per vnode: vnode address, mount address, type
+ * name, reference count and path (mount path followed by the path of
+ * the vnode's first name). The table lock is held while dumping.
+ */
+void
+vnode_dump(void)
+{
+    int i;
+    struct vnode *vp;
+    struct mount *mp;
+    static const char type[][6] = { "VNON ", "VREG ", "VDIR ", "VBLK ",
+                                    "VCHR ", "VLNK ", "VSOCK", "VFIFO" };
+    const size_t ntypes = sizeof(type) / sizeof(type[0]);
+
+    VNODE_LOCK();
+    kprintf("Dump vnode\n");
+    kprintf(" vnode    mount    type  refcnt path\n");
+    kprintf(" -------- -------- ----- ------ ------------------------------\n");
+
+    for (i = 0; i < VNODE_BUCKETS; i++) {
+        LIST_FOREACH(vp, &vnode_table[i], v_link) {
+            /* Types without a name entry must not index past the table. */
+            const char *tname = ((size_t)vp->v_type < ntypes)
+                ? type[vp->v_type] : "?????";
+
+            mp = vp->v_mount;
+
+            /*
+             * One conversion per argument: the former blkno column had
+             * no matching argument and shifted every following one.
+             */
+            kprintf(" %08lx %08lx %s %6d %s%s\n", (u_long)vp,
+                    (u_long)mp, tname, vp->v_refcnt,
+                    (strlen(mp->m_path) == 1) ? "\0" : mp->m_path,
+                    vn_path(vp));
+        }
+    }
+    kprintf("\n");
+    VNODE_UNLOCK();
+}
+#endif
+
+/* vnode operation stub that always succeeds (returns 0). */
+int vop_nullop(void)
+{
+    return 0;
+}
+
+/* vnode operation stub that always fails with EINVAL. */
+int vop_einval(void)
+{
+    return EINVAL;
+}
+
+/* vnode operation stub that always fails with EPERM. */
+int vop_eperm(void)
+{
+    return EPERM;
+}
+
+/* vnode operation stub that always fails with EROFS. */
+int vop_erofs(void)
+{
+    return EROFS;
+}
+
+/*
+ * One-time initialisation of the vnode layer (called from vfs_init):
+ * every bucket of the vnode hash table is set up as an empty list.
+ */
+void
+vnode_init(void)
+{
+    int bucket = 0;
+
+    while (bucket < VNODE_BUCKETS) {
+        LIST_INIT(&vnode_table[bucket]);
+        bucket++;
+    }
+}
+
+/*
+ * Link dentry dp onto the head of the vnode's name list, under the
+ * vnode lock.
+ */
+void
+vn_add_name(struct vnode *vp, struct dentry *dp)
+{
+    vn_lock(vp);
+    LIST_INSERT_HEAD(&vp->v_names, dp, d_names_link);
+    vn_unlock(vp);
+}
+
+/*
+ * Unlink dentry dp from the name list it is on, under the lock of
+ * vnode vp.
+ */
+void
+vn_del_name(struct vnode *vp, struct dentry *dp)
+{
+    vn_lock(vp);
+    LIST_REMOVE(dp, d_names_link);
+    vn_unlock(vp);
+}
+
--
2.19.2
_______________________________________________
Minios-devel mailing list
Minios-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/minios-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |